In [9]:
# Imports
import time
import faiss
import json
import requests
import numpy as np
import json
import asyncio
import aiohttp
import random
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from tqdm.notebook import tqdm
from collections import deque

# Constants
# Crawl scope: scrape_recursive only follows URLs that start with `root`.
root = 'https://en.wikipedia.org/wiki/Bias'
# Start page handed to scrape_recursive at the bottom of this cell.
base_url = 'https://en.wikipedia.org/wiki/Bias'

# Local Ollama endpoints and model name. The visible code of this cell does
# not read these lower-case names; the later cell defines upper-case versions.
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

# When True, scrape_recursive prints the exception for a URL that fails.
VERBOSE = True
# Default link depth for scrape_recursive; 0 means only the start page is fetched.
DEPTH = 0
# Default cap on how many URLs scrape_recursive will visit.
MAX_LEN = 3

# Candidate User-Agent strings; one is picked at random for the Chrome options below.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
]

# Headless Chrome configuration: every command-line switch is listed once and
# applied in order; the User-Agent is drawn at random from USER_AGENTS.
chrome_options = Options()
for flag in (
    '--enable-javascript',
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    f"user-agent={random.choice(USER_AGENTS)}",
):
    chrome_options.add_argument(flag)

# Single browser instance shared by the rest of this cell.
service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

def apply_stealth(driver):
    """Call selenium_stealth's ``stealth`` on ``driver`` with a fixed profile.

    The profile reports English languages, a Google vendor string, a Win32
    platform and Intel WebGL vendor/renderer strings, and switches on the
    hairline fix, insecure-content and fake-media-device options.

    Args:
        driver: the WebDriver instance to patch.
    """
    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
        "run_on_insecure_content": True,
        "fake_media_devices": True,
    }
    stealth(driver, **stealth_settings)

# Patch the driver once, right after creation; scrape_recursive does not re-apply it per page.
apply_stealth(driver)

def extract_clean_text_from_html(html_content):
    """Return the text of an HTML document as one whitespace-normalised string.

    ``script``, ``style``, ``footer``, ``header`` and ``nav`` elements are
    removed before the text is extracted.

    Args:
        html_content: HTML markup to parse with the ``html.parser`` backend.

    Returns:
        str: the remaining text with every run of whitespace collapsed to a
        single space.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Drop elements that carry no article content.
    unwanted_elements = document.find_all(["script", "style", "footer", "header", "nav"])
    for element in unwanted_elements:
        element.decompose()

    # Extract the text, then squeeze any leftover whitespace runs.
    raw_text = document.get_text(separator=' ', strip=True)
    return ' '.join(raw_text.split())

def scrape_recursive(base_url, depth=DEPTH, maxLen=MAX_LEN):
    """Breadth-first crawl from ``base_url`` that collects cleaned page text.

    Pages are loaded through the module-level Selenium ``driver``. A URL is
    only fetched when it starts with the module-level ``root`` prefix and
    contains no ``#``.

    Args:
        base_url: URL of the first page to fetch.
        depth: maximum link distance from ``base_url`` that is followed
            (0 fetches only the start page).
        maxLen: maximum number of URLs to visit. A URL counts as visited as
            soon as it is attempted, even if loading it fails.

    Returns:
        list[tuple[str, str]]: ``(url, text)`` pairs in visit order, where
        ``text`` comes from ``extract_clean_text_from_html``.
    """
    visited_urls = set()
    queued_urls = {base_url}  # keeps the same link from being enqueued repeatedly
    text_data = []
    queue = deque([(base_url, 0)])  # (url, current_depth)

    while queue:
        # Page budget used up: nothing still queued could be processed.
        if len(visited_urls) >= maxLen:
            break

        url, current_depth = queue.popleft()

        if url in visited_urls or current_depth > depth:
            continue
        if not url.startswith(root) or '#' in url:
            continue

        print("Processing: ", url)
        visited_urls.add(url)

        try:
            driver.get(url)
            time.sleep(1)  # fixed pause after navigation (presumably lets the page settle)

            # Read the DOM once so text and links come from the same snapshot.
            html = driver.page_source
            text_data.append((url, extract_clean_text_from_html(html)))

            if current_depth < depth:
                soup = BeautifulSoup(html, 'html.parser')
                for link in soup.find_all('a', href=True):
                    # Resolve against the page the link was found on; using the
                    # crawl's start URL mis-resolves relative hrefs on deeper pages.
                    new_url = urljoin(url, link['href'])
                    if new_url in queued_urls:
                        continue
                    if new_url.startswith(root) and '#' not in new_url:
                        queued_urls.add(new_url)
                        queue.append((new_url, current_depth + 1))

        except Exception as e:
            # Best effort: a failing page is reported (when VERBOSE) and skipped.
            if VERBOSE:
                print(f"Error with URL {url}: {e}")

    return text_data


# Run the crawl and always release the browser afterwards. This cell creates a
# new Chrome instance each time it runs, so leaving the driver open would leak
# one headless browser process per execution.
try:
    scraped_data = scrape_recursive(base_url)
finally:
    driver.quit()
    print("Finished!")

print("Total url pased: ", len(scraped_data))

# Preview: URL, text length and the first 200 characters of every scraped page.
for url, text in scraped_data:
    print(f"URL: {url}\ncontent size: {len(text)}\nText: {text[:200]}...\n")

Processing:  https://en.wikipedia.org/wiki/Bias
Finished!
Total url pased:  1
URL: https://en.wikipedia.org/wiki/Bias
content size: 80478
Text: Bias - Wikipedia Jump to content The Bangla Wikivoyage Article Contest 2024 is now underway. Participate for a chance to win prizes! From Wikipedia, the free encyclopedia Inclination for or against Fo...



In [17]:
import os
import time
import faiss
import json
import numpy as np
import sqlite3
import aiohttp
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm.notebook import tqdm
from collections import deque
from urllib.parse import urljoin
from aiohttp import ClientTimeout

# SQLite file holding the partition texts (see setup_db / store_partitions_in_db).
DB_PATH = "../datasets/rag_faiss.db"
# File the FAISS index is written to and read back from.
INDEX_PATH = "../datasets/faiss_index.bin"

# Ollama endpoints on the local server: model info, embeddings, text generation.
OLLAMA_URL_INF = "http://localhost:11434/api/show"
OLLAMA_URL_EMB = "http://localhost:11434/api/embeddings"
OLLAMA_URL_GEN = "http://localhost:11434/api/generate"
# Model name sent with every Ollama request made in this cell.
OLLAMA_MODEL_NAME = "llama3.2:latest"
# When True, store_in_faiss prints the embedding dimension.
VERBOSE = True

# Database setup for partition metadata
def setup_db():
    """Create the ``partitions`` table in the SQLite file at ``DB_PATH``.

    The parent directory of ``DB_PATH`` is created first when it does not
    exist, because ``sqlite3.connect`` cannot open a file inside a missing
    directory. The table is only created when absent, so calling this
    function repeatedly is safe.
    """
    os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS partitions (id INTEGER PRIMARY KEY, text TEXT)''')
        conn.commit()
    finally:
        # Release the file handle even when the statement fails.
        conn.close()

# Store partitions in database
def store_partitions_in_db(partitions):
    """Append every partition text as a new row of the ``partitions`` table.

    All rows are inserted in one transaction; the connection is closed even
    when the insert fails, in which case nothing is committed.

    Args:
        partitions: iterable of strings, stored in iteration order.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.executemany(
            "INSERT INTO partitions (text) VALUES (?)",
            [(part,) for part in partitions],
        )
        conn.commit()
    finally:
        conn.close()

# Load partitions from the database
def load_partitions_from_db():
    """Return every stored partition text, ordered by row id.

    retrieve_with_rag uses the position in this list as the FAISS row number,
    so the order must be deterministic; without ``ORDER BY`` SQL leaves the
    row order unspecified.

    Returns:
        list[str]: the ``text`` column of ``partitions`` in ascending ``id``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.execute("SELECT text FROM partitions ORDER BY id")
        return [row[0] for row in cur.fetchall()]
    finally:
        conn.close()

# Save FAISS index to disk
def save_faiss_index(index):
    """Write ``index`` to the file at ``INDEX_PATH`` via ``faiss.write_index``."""
    faiss.write_index(index, INDEX_PATH)

# Load FAISS index from disk if available
def load_faiss_index(dimension):
    """Return the index saved at ``INDEX_PATH``, or a new ``IndexFlatL2``.

    Args:
        dimension: vector width for the new index. It is only used when no
            file exists at ``INDEX_PATH``; a saved index is returned as read.

    Returns:
        The index read from disk, or an empty ``faiss.IndexFlatL2(dimension)``.
    """
    # Nothing persisted yet: start from an empty flat L2 index.
    if not os.path.exists(INDEX_PATH):
        return faiss.IndexFlatL2(dimension)
    return faiss.read_index(INDEX_PATH)

# Partition text into manageable lengths
def partition_text(text, max_length):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` words.

    Sentences are delimited by ``'. '`` and words by whitespace. A chunk is
    closed as soon as adding the next sentence would push it past
    ``max_length`` words. A single sentence longer than ``max_length`` is kept
    whole as its own chunk rather than being split.

    Args:
        text: the text to split; ``None`` or blank text yields no chunks.
        max_length: word budget per chunk.

    Returns:
        list[str]: the chunks in order, each re-joined with ``'. '``. No
        chunk is ever empty.
    """
    if not text or not text.strip():
        return []

    partitions, current_part, current_length = [], [], 0

    for sentence in tqdm(text.split('. '), desc="Partitioning text"):
        sentence_length = len(sentence.split())

        # Only flush a chunk that holds something; otherwise an over-long
        # first sentence would emit an empty partition.
        if current_part and current_length + sentence_length > max_length:
            partitions.append('. '.join(current_part))
            current_part, current_length = [], 0

        current_part.append(sentence)
        current_length += sentence_length

    if current_part:
        partitions.append('. '.join(current_part))

    return partitions

# Fetch model embedding dimension
async def get_embedding_shape():
    """Ask Ollama's model-info endpoint for the model's embedding width.

    The value is read from ``model_info``. The ``llama.embedding_length``
    entry is preferred; otherwise the first ``<architecture>.embedding_length``
    entry is used, so non-llama models work as well.

    Returns:
        The embedding length reported by Ollama, or 0 (after printing an
        error) when the request fails or the response holds no such entry.
    """
    async with aiohttp.ClientSession() as session:
        payload = {"model": OLLAMA_MODEL_NAME}
        headers = {"Content-Type": "application/json"}
        async with session.post(OLLAMA_URL_INF, headers=headers, data=json.dumps(payload)) as response:
            if response.status != 200:
                print(f"ERROR: Error from Ollama: {response.status}")
                return 0
            result = await response.json()

    model_info = result.get('model_info') or {}
    if "llama.embedding_length" in model_info:
        return model_info["llama.embedding_length"]

    # Other architectures expose the same value under their own prefix.
    for key, value in model_info.items():
        if key.endswith(".embedding_length"):
            return value

    print("ERROR: Error from Ollama: no embedding_length entry in model_info")
    return 0

from aiohttp import ClientTimeout
# Timeout handed to the aiohttp session in get_embeddings; `total` is in
# seconds per aiohttp's ClientTimeout.
TIMEOUT = ClientTimeout(total=180)

async def get_embeddings(partitions):
    """Fetch one embedding vector per text from Ollama's embeddings endpoint.

    Requests are sent one after another over a single session. When a request
    fails, or a successful response carries no vector, an error is printed and
    an all-zero placeholder is used so the result stays aligned with
    ``partitions``.

    Args:
        partitions: iterable of strings to embed.

    Returns:
        list[numpy.ndarray]: one 1-D float32 array per input, in input order.
    """
    embeddings = []
    dimension = await get_embedding_shape()
    headers = {"Content-Type": "application/json"}

    async with aiohttp.ClientSession(timeout=TIMEOUT) as session:
        for partition in tqdm(partitions, desc="Fetching embeddings"):
            payload = {"model": OLLAMA_MODEL_NAME, "prompt": partition}
            vector = None
            async with session.post(OLLAMA_URL_EMB, headers=headers, data=json.dumps(payload)) as response:
                if response.status == 200:
                    result = await response.json()
                    vector = result.get('embedding')
                    if not vector:
                        print("ERROR: Ollama returned no embedding; using a zero vector")
                else:
                    print(f"ERROR: Error from Ollama: {response.status}")

            # Missing key, empty list and failed request all fall back to zeros.
            if not vector:
                vector = np.zeros(dimension)

            # FAISS operates on float32 data, so normalise the dtype here.
            embeddings.append(np.asarray(vector, dtype=np.float32))

    return embeddings


# Store embeddings in persistent FAISS
async def store_in_faiss(partitions):
    """Embed ``partitions`` and append them to the persistent index and database.

    The index is loaded from disk when one exists, otherwise created with the
    model's embedding dimension. When at least one embedding was produced, the
    vectors are added to the index, the index is written back to disk, and the
    partition texts are stored in the database in the same order.

    Args:
        partitions: list of strings to embed and store.

    Returns:
        The FAISS index after the new vectors were added.
    """
    dimension = await get_embedding_shape()
    if VERBOSE:
        print(f"Model's embedding dimension: {dimension}")

    # Load or create FAISS index
    index = load_faiss_index(dimension)
    embeddings = await get_embeddings(partitions)

    # Save embeddings to FAISS index and database
    if embeddings:
        # FAISS works on contiguous 2-D float32 matrices; cast explicitly
        # instead of relying on the wrapper to convert float64 input.
        matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
        index.add(matrix)
        save_faiss_index(index)
        store_partitions_in_db(partitions)

    return index

# Retrieve function with RAG applied to FAISS
async def retrieve_with_rag(query, faiss_index, k=5):
    """Answer ``query`` with the model, grounded on the nearest stored partitions.

    The query is embedded, the ``k`` nearest vectors are looked up in
    ``faiss_index``, the matching partition texts are loaded from the database
    and placed in the prompt as context, and the prompt is sent to the
    generation endpoint.

    Args:
        query: the question to answer.
        faiss_index: index whose row numbers correspond to positions in the
            list returned by ``load_partitions_from_db``.
        k: number of neighbours to request from the index.

    Returns:
        dict: the decoded JSON body of the generation response.
    """
    query_embedding = (await get_embeddings([query]))[0]
    # FAISS works on contiguous 2-D float32 matrices.
    query_matrix = np.ascontiguousarray([query_embedding], dtype=np.float32)
    _, indices = faiss_index.search(query_matrix, k=k)

    partitions = load_partitions_from_db()
    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    # A plain upper-bound check would let -1 through and silently pull in the
    # last partition via negative indexing, so require a non-negative id too.
    retrieved_docs = [partitions[i] for i in indices[0] if 0 <= i < len(partitions)]

    combined_docs = "\n".join(retrieved_docs)
    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"

    async with aiohttp.ClientSession() as session:
        payload = {"model": OLLAMA_MODEL_NAME, "prompt": rag_prompt, "stream": False}
        headers = {"Content-Type": "application/json"}
        async with session.post(OLLAMA_URL_GEN, headers=headers, data=json.dumps(payload)) as response:
            return await response.json()

# Initialize DB and process partitions
# Only the first 7000 characters of the first scraped page are indexed, split
# into chunks of at most 512 words.
# NOTE: store_in_faiss appends to the index loaded from disk and inserts new
# database rows, so every re-run of this cell stores the same chunks again.
setup_db()
partitions = partition_text(scraped_data[0][1][:7000], max_length=512)
faiss_index = await store_in_faiss(partitions)

# Run a query
async def ask(query):
    """Send ``query`` through the RAG pipeline and return the answer text.

    Uses the module-level ``faiss_index`` for retrieval.

    Args:
        query: the question to ask.

    Returns:
        The ``"response"`` field of the generation result, or the string
        ``"No response available"`` when that field is absent.
    """
    rag_result = await retrieve_with_rag(query, faiss_index)
    answer = rag_result.get("response", "No response available")
    return answer

Partitioning text:   0%|          | 0/18 [00:00<?, ?it/s]

Model's embedding dimension: 3072


Fetching embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

CancelledError: 

In [13]:
# Ask a general question about the indexed page and print the model's answer.
ans = await ask("Whats the page about?")
print(ans)

Fetching embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

The page is about biases, specifically types of biases such as:

* Cognitive biases (repeating or basic missteps in thinking, assessing, recollecting, etc.)
* Statistical bias (a systematic error resulting from unfair sampling or estimation processes)

It seems to be an informative article that explains the concept and types of biases.


In [14]:
# Same question as the previous cell, asked again.
ans = await ask("Whats the page about?")
print(ans)

Fetching embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

It appears that the text is a passage about "Biases". It discusses various types of biases, including:

1. Cognitive biases
2. Statistical bias
3. Innate or learned biases

The passage also provides an etymology of the word "bias" and its origins in Old Provençal and French.

Would you like me to summarize the main points or provide more information on a specific type of bias?


In [15]:
# Probe question: the prompt built in retrieve_with_rag instructs the model to
# say it doesn't know when the retrieved context lacks the answer.
ans = await ask("Is ther anything about Bangladesh?")
print(ans)

Fetching embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

I couldn't find any information on biases related to Bangladesh in my training data. However, I can tell you that biases exist in various forms in the country, including:

1. **Social biases**: Bangladesh is a socially conservative country with strong cultural traditions. People may hold biases against individuals from different castes, ethnicities, or regions.
2. **Economic biases**: Bangladesh has a large wealth gap, and some people may hold biases against those who are perceived as wealthy or from the upper class.
3. **Education biases**: Education is highly valued in Bangladesh, but some people may hold biases against those who are less educated or come from disadvantaged backgrounds.

Some specific examples of biases in Bangladesh include:

1. **Caste-based biases**: Historically, the country has a caste system, and some people may hold biases against individuals from lower castes.
2. **Regional biases**: Bangladesh is a diverse country with different regions having distinct cultu

In [16]:
# The general question once more, asked after the probe question.
ans = await ask("Whats the page about?")
print(ans)

Fetching embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

The page is about the concept of bias in various contexts, including:

* Cognitive biases: systematic errors or missteps in thinking and perception
* Statistical bias: systematic errors in statistical sampling or estimation processes
* Biases in science and engineering: systematic errors that can lead to inaccurate results
* Types of biases: including innate and learned biases, cognitive biases, and others

The text provides definitions, examples, and etymology of the term "bias", as well as information on how biases can affect perception, judgment, and behavior.
