In [12]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
import time
import faiss
import numpy as np
from tqdm.notebook import tqdm
import json

# Configure Selenium Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run in headless mode
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Initialize the Chrome driver
service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

def get_visitable_links(base_url):
    """Scrapes all valid links from the base_url."""
    driver.get(base_url)
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    links = []
    for a_tag in soup.find_all('a', href=True):
        full_url = urljoin(base_url, a_tag['href'])
        if '#' not in full_url and '%' not in full_url and full_url.startswith(base_url) and full_url not in links:
            links.append(full_url)
    return links

def scrape_text_from_url(url):
    """Scrapes the textual content from a given URL."""
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}")
        return ""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Use get_text() without strip=True to preserve spaces between inline elements
    text_content = ' '.join([p.get_text() for p in soup.find_all('p')])
    
    # Optionally, normalize excessive whitespace
    text_content = ' '.join(text_content.split())
    
    return text_content

def partition_text(text, max_length=512):
    """Partitions the text into smaller parts to feed into RAG."""
    sentences = text.split('. ')
    partitions = []
    current_part = []
    current_length = 0
    
    for sentence in sentences:
        current_length += len(sentence.split())
        current_part.append(sentence)
        
        if current_length > max_length:
            partitions.append('. '.join(current_part))
            current_part = []
            current_length = 0

    if current_part:
        partitions.append('. '.join(current_part))

    return partitions

def store_in_faiss(partitions, embedding_func):
    """Stores document embeddings in a FAISS index."""
    dimension = 768  # Assuming LLaMA returns 768-dim embeddings (adjust based on model)
    index = faiss.IndexFlatL2(dimension)  # L2 distance

    doc_vectors = []
    doc_ids = []
    
    for i, partition in enumerate(partitions):
        embedding = embedding_func(partition)
        index.add(np.array([embedding]))  # Add to FAISS index
        doc_vectors.append(embedding)
        doc_ids.append(i)
    
    return index, doc_ids

def get_embedding(text):
    """Gets embedding for the text using Ollama (LLaMA 3.2)."""
    payload = {
        "model": "llama3.2",
        "prompt": text,
        "max_tokens": 0,  # Set 0 if we only want embeddings
    }
    
    headers = {"Content-Type": "application/json"}
    response = requests.post(ollama_url, headers=headers, data=json.dumps(payload))
    
    if response.status_code == 200:
        result = response.json()
        return np.array(result['embedding'])  # Assuming embeddings are returned as 'embedding'
    else:
        print(f"Error from Ollama: {response.status_code}")
        return np.zeros(768)  # Return zero vector on error (adjust dimension based on model)

# Main logic to scrape and partition data
base_url = 'https://en.wikipedia.org/wiki/Bangladesh'
valid_links = get_visitable_links(base_url)

print(f"Found {len(valid_links)} valid links. Scraping content...")

# Scrape text content from all valid links
all_text = ""
for link in tqdm(valid_links):
    text_content = scrape_text_from_url(link)
    all_text += text_content + "\n\n"

Found 58 valid links. Scraping content...


  0%|          | 0/58 [00:00<?, ?it/s]

In [13]:
# Partition the text
partitions = partition_text(all_text, max_length=512)

print(f'partition length: {len(partitions)}')

partition length: 235


In [14]:
len(all_text)

838543

In [18]:
partitions[3]

'The Ganges unites with the Jamuna (main channel of the Brahmaputra) and later joins the Meghna, finally flowing into the Bay of Bengal. Bangladesh is called the "Land of Rivers",[50] as it is home to over 57 trans-boundary rivers, the most of any nation-state. Water issues are politically complicated since Bangladesh is downstream of India.[51] Bangladesh is predominantly rich fertile flat land. Most of it is less than 12 m (39 ft) above sea level, and it is estimated that about 10% of its land would be flooded if the sea level were to rise by 1 m (3.3 ft).[52] 12% of the country is covered by hill systems. The country\'s haor wetlands are of significance to global environmental science. The highest point in Bangladesh is the Saka Haphong, located near the border with Myanmar, with an elevation of 1,064 m (3,491 ft).[53] Previously, either Keokradong or Tazing Dong were considered the highest. In Bangladesh forest cover is around 14% of the total land area, equivalent to 1,883,400 hec

In [38]:
ollama_url = "http://localhost:11434/api/embeddings"

def get_embedding(text):
    payload = {
        "model": "llama3.2:latest",
        "prompt": text
    }
    
    headers = {"Content-Type": "application/json"}
    response = requests.post(ollama_url, headers=headers, data=json.dumps(payload))
    
    if response.status_code == 200:
        result = response.json()
        return np.array(result['embedding'])  # Assuming embeddings are returned as 'embedding'
    else:
        print(f"Error from Ollama: {response.status_code}")
        return np.zeros(768)  # Return zero vector on error (adjust dimension based on model)

In [33]:
#get_embedding("Hello Worlds")

In [34]:
# Store the partitions in FAISS
faiss_index, doc_ids = store_in_faiss(partitions, embedding_func=get_embedding)

b'{"embedding":[1.3492603302001953,-1.7394051551818848,1.8865591287612915,-1.8877288103103638,-0.2721545398235321,-2.8973348140716553,1.1552950143814087,1.1901724338531494,-0.15606293082237244,0.04916686937212944,-3.4736690521240234,1.7532808780670166,1.6012792587280273,2.2840280532836914,0.09682858735322952,-1.724705457687378,1.5661413669586182,-1.0167689323425293,-2.3075103759765625,2.0169637203216553,-1.5915762186050415,-0.5994940400123596,1.3451006412506104,-1.7665663957595825,-2.209132194519043,-1.16912043094635,1.9079135656356812,-4.336772918701172,-0.06590162217617035,-0.9006782174110413,1.652856707572937,0.24669378995895386,0.3968881666660309,-1.414749264717102,-1.2342872619628906,-1.2610446214675903,-1.6042582988739014,1.1051777601242065,0.9391537308692932,2.6191537380218506,0.5302271842956543,-1.2857186794281006,2.415353298187256,-0.17404820024967194,-2.1590816974639893,0.20285190641880035,1.1469078063964844,1.3254128694534302,0.6604043245315552,1.3675076961517334,2.553591966

AssertionError: 

In [36]:
def retrieve_with_rag(query, faiss_index, doc_ids):
    query_embedding = get_embedding(query)
    print(f'embd len: {len(query_embedding)}')
    
    # Search in FAISS index
    D, I = faiss_index.search(np.array([query_embedding]), k=5)  # Retrieve top-5 closest documents
    
    print(D, I)
    
    # For simplicity, concatenate the closest documents and feed to RAG
    retrieved_docs = []
    for i in I[0]:
        doc_id = doc_ids[i]
        retrieved_docs.append(partitions[doc_id])  # Get the document text from partition list
    
    combined_docs = "\n".join(retrieved_docs)
    
    # Perform RAG with retrieved docs
    rag_prompt = f"Context:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    print(rag_prompt)
    
    response = requests.post(ollama_url, headers={"Content-Type": "application/json"},
                             data=json.dumps({"model": "llama3.2", "prompt": rag_prompt}))
    
    if response.status_code == 200:
        return response.json()["completion"]
    else:
        return f"Error from Ollama: {response.status_code}"

In [39]:
# Query and retrieval with RAG
query = "Who is the father of Bangladesh?"
rag_response = retrieve_with_rag(query, faiss_index, doc_ids)

print(f"RAG Response:\n{rag_response}")

# Close the Selenium browser
#driver.quit()

embd len: 3072


AssertionError: 