In [55]:
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoTokenizer
import os
from io import StringIO
import logging

# Set the TOKENIZERS_PARALLELISM environment variable to "false" before the
# tokenizer is loaded below (presumably to silence/avoid the Hugging Face
# tokenizers parallelism behaviour inside the notebook kernel).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE(review): absolute, machine-specific path to a local model directory;
# it must be adjusted before running on another machine.
model_path = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
# Module-level tokenizer that the chunking helpers use to count and slice tokens.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define maximum token length per chunk
max_token_length = 512

def init_driver():
    """Build and return a headless Chrome WebDriver.

    Chrome is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` flags and a default ``Service``.
    The caller owns the returned driver.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(service=Service(), options=chrome_options)

def get_text_content(element):
    """Return the element's stripped text fragments joined by single spaces.

    ``element`` must expose a ``stripped_strings`` iterable (as BeautifulSoup
    tags do). An element without text yields the empty string.
    """
    fragments = list(element.stripped_strings)
    return ' '.join(fragments)

def chunk_text(text, max_token_length):
    """Split ``text`` into pieces of at most ``max_token_length`` tokens.

    The text is encoded with the module-level ``tokenizer`` without special
    tokens, the id sequence is cut into consecutive windows, and each window
    is decoded back to a string.

    Args:
        text: The string to split.
        max_token_length: Maximum number of tokens per piece; must be positive.

    Returns:
        A list of decoded strings, in order. Empty when ``text`` encodes to
        no tokens.

    Raises:
        ValueError: If ``max_token_length`` is not positive. A zero or
            negative window would never advance through the token list.
    """
    if max_token_length <= 0:
        raise ValueError(
            f"max_token_length must be positive, got {max_token_length}"
        )

    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(tokens[start:start + max_token_length])
        for start in range(0, len(tokens), max_token_length)
    ]

def merge_small_chunks(chunks, max_token_length):
    """Greedily merge consecutive chunks into larger ones where possible.

    Chunks are concatenated (space-separated, in order) for as long as the
    combined text, encoded with the module-level ``tokenizer`` using its
    default special tokens, stays within ``max_token_length`` tokens.

    Args:
        chunks: Iterable of text chunks, in document order.
        max_token_length: Token limit per merged chunk; must be positive.

    Returns:
        A list of non-empty, stripped strings. A single input chunk that is
        over the limit on its own is re-split on token boundaries instead of
        being emitted oversized.

    Raises:
        ValueError: If ``max_token_length`` is not positive.
    """
    if max_token_length <= 0:
        raise ValueError(
            f"max_token_length must be positive, got {max_token_length}"
        )

    merged_chunks = []

    def fits(text):
        # Default encode() so the special tokens count against the limit.
        return len(tokenizer.encode(text)) <= max_token_length

    def flush(text):
        """Append ``text`` to the result, splitting it if it is over the limit."""
        text = text.strip()
        if not text:
            # Nothing pending yet (e.g. the very first chunk did not fit);
            # never emit an empty chunk.
            return
        if fits(text):
            merged_chunks.append(text)
            return
        # Encoding the empty string leaves only the special tokens the
        # tokenizer adds by default; reserve room for them in every piece.
        overhead = len(tokenizer.encode(""))
        budget = max(1, max_token_length - overhead)
        # Slice content ids only, so no special-token text leaks into the
        # decoded pieces.
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        for start in range(0, len(token_ids), budget):
            piece = tokenizer.decode(token_ids[start:start + budget]).strip()
            if piece:
                merged_chunks.append(piece)

    pending = ""
    for chunk in chunks:
        candidate = f"{pending} {chunk}" if pending else chunk
        if fits(candidate):
            pending = candidate
        else:
            flush(pending)
            pending = chunk

    # The last pending chunk goes through the same size check as the others.
    flush(pending)
    return merged_chunks

def chunk_table(df, max_token_length, header_info):
    """Serialize a table into text chunks that each repeat the header.

    Every chunk has the form ``"<header_info> ||| <row> || <row> ||"``.
    A row is its non-NaN cells joined by ``' | '``. Rows are packed into
    the current chunk while the chunk, encoded with the module-level
    ``tokenizer``, stays within ``max_token_length`` tokens.

    A row that does not fit is moved whole into a fresh chunk. It is only
    split (via ``chunk_text``) when it cannot fit even on its own next to
    the header.

    Args:
        df: DataFrame whose rows are serialized in order.
        max_token_length: Token limit per chunk.
        header_info: Context text placed at the start of every chunk.

    Returns:
        A list of stripped chunk strings. Chunks without any row data are
        not emitted, so an empty table yields ``[]``.

    NOTE(review): if ``header_info`` alone is near or above the limit,
    chunks will still exceed it; the header is never truncated.
    """
    prefix = header_info + ' ||| '  # Distinct marker between header and rows
    row_sep = ' || '

    def fits(text):
        return len(tokenizer.encode(text)) <= max_token_length

    table_chunks = []
    current_chunk = prefix
    has_rows = False  # True once current_chunk holds at least one row

    for _, row in df.iterrows():
        row_text = ' | '.join(str(cell) for cell in row if pd.notna(cell))
        if not row_text:
            continue  # nothing to serialize for an all-NaN row

        if fits(current_chunk + row_text + row_sep):
            current_chunk += row_text + row_sep
            has_rows = True
            continue

        # The row does not fit here: close the current chunk first so the
        # row stays intact in the next one whenever possible.
        if has_rows:
            table_chunks.append(current_chunk.strip())
            current_chunk = prefix
            has_rows = False

        if fits(prefix + row_text + row_sep):
            current_chunk = prefix + row_text + row_sep
            has_rows = True
            continue

        # The row is too long even alone: split it with a budget that
        # leaves room for the header and separators in every piece.
        piece_budget = max(
            1, max_token_length - len(tokenizer.encode(prefix + row_sep))
        )
        pieces = chunk_text(row_text, piece_budget)
        for piece in pieces[:-1]:
            table_chunks.append((prefix + piece + row_sep).strip())
        if pieces:
            # The last piece may be short; keep it open for following rows.
            current_chunk = prefix + pieces[-1] + row_sep
            has_rows = True

    if has_rows:
        table_chunks.append(current_chunk.strip())

    return table_chunks


def scrape_and_chunk_page(url):
    """Render ``url`` in headless Chrome and split its content into chunks.

    Headings (h1-h4), paragraphs and tables are visited in document order.
    Headings and paragraphs go through ``chunk_text``; tables are parsed
    with pandas and go through ``chunk_table``, prefixed with the most
    recent heading and the column names. The resulting chunks are then
    merged with ``merge_small_chunks``.

    Args:
        url: Address of the page to load.

    Returns:
        A list of ``(chunk_text, url)`` tuples; every chunk carries the
        same source ``url``.
    """
    driver = init_driver()
    try:
        driver.get(url)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, otherwise every call leaves a
        # Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    chunks = []
    last_header = ""  # most recent heading, used as context for tables

    for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'table']):
        if element.name == 'table':
            try:
                df = pd.read_html(StringIO(str(element)))[0]
            except ValueError:
                # read_html raises ValueError when it finds no parsable
                # table; skip this element instead of aborting the page.
                continue

            # Drop empty rows and columns
            df = df.dropna(axis=0, how='all').dropna(axis=1, how='all')

            # Ensure column headers are strings
            df.columns = [str(col) for col in df.columns]
            if df.columns.empty:
                header_info = last_header
            else:
                header_info = last_header + ' | ' + ' | '.join(df.columns)

            chunks.extend(chunk_table(df, max_token_length, header_info))
        else:
            text = get_text_content(element)
            if element.name != 'p':
                # Headings also become context for the elements after them.
                last_header = text
            chunks.extend(chunk_text(text, max_token_length))

    # Merge small chunks where possible, then attach the source URL.
    final_chunks = merge_small_chunks(chunks, max_token_length)
    return [(chunk, url) for chunk in final_chunks]

# Usage example:
# Runs at cell execution time: loads the page through scrape_and_chunk_page()
# and keeps the resulting (chunk, url) tuples in `scraped_chunks`.
url = "https://en.wikipedia.org/wiki/List_of_wars_by_death_toll"
scraped_chunks = scrape_and_chunk_page(url)

print(f"Total Chunks: {len(scraped_chunks)}")

# Preview the first five chunks.
# NOTE(review): the loop variable rebinds the module-level `url`; every tuple
# carries the same URL, so the value does not actually change.
for chunk, url in scraped_chunks[:5]:
    print(f"Chunk: {chunk}\nSource URL: {url}\n")


Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


Total Chunks: 13
Chunk: contents list of wars by death toll List of wars by death toll | 0 ||| Part of a series on || War (outline) || showHistory || showMilitary || showBattlespace || showWeapons || showTactics || showOperational || showStrategy || showGrand strategy || showAdministrative || showOrganization || showPersonnel || showLogistics || showScience || showLaw || showTheory || showNon-warfare || showCulture || showRelated || hideLists Battles Military occupations Military terms Operations Sieges War crimes Wars Weapons Writers || vte || this list of wars by death toll includes all deaths that are either directly or indirectly caused by war. these numbers include the deaths of military personnel which are the direct results of a battle or other military wartime actions, as well as wartime / war - related deaths of civilians which are often results of war - induced epidemics, famines, genocide, etc. due to incomplete records, the destruction of evidence, differing methods of coun

In [56]:
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from IPython.display import display, clear_output, Markdown
import requests
import json
import asyncio

# One shared HTTP session with default headers for the requests made to Ollama.
session = requests.Session()
session.headers.update({"Connection": "keep-alive", "Content-Type": "application/json"})

# Local service endpoints and names used by the functions in this cell.
qdrant_url = "http://localhost:6333"
collection_name = "wiki_collection"
# NOTE(review): ollama_url_inf and ollama_url_emb are not referenced by any
# code visible in this file; only ollama_url_gen is used (by ask()).
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

# Module-level Qdrant client and embedding model shared by the helpers below.
# `model_path` is the local model directory defined in the previous cell.
client = QdrantClient(url=qdrant_url)
embedding_model = SentenceTransformer(model_path)

def get_embedding(text):
    """Encode ``text`` with the module-level ``embedding_model`` and return the result."""
    vector = embedding_model.encode(text)
    return vector


def create_collection_if_not_exists(dimension):
    """Drop and recreate the Qdrant collection named by ``collection_name``.

    Despite the function name, no existence check is performed: the
    collection is always deleted first and then created again with
    cosine-distance vectors of size ``dimension``, so previously stored
    points are discarded.
    """
    client.delete_collection(collection_name=collection_name)

    vector_settings = models.VectorParams(
        size=dimension,
        distance=models.Distance.COSINE,
    )
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vector_settings,
    )
    
def upsert_points_with_metadata(embeddings, chunks):
    """Upsert one point per (embedding, chunk) pair into the collection.

    ``chunks`` holds ``(text, url)`` tuples aligned with ``embeddings``.
    Point ids are the positions in that pairing, and each payload stores
    the chunk under ``"text"`` and its source under ``"url"``. All points
    are sent in a single ``upsert`` call.
    """
    points = []
    for point_id, (vector, (text, source_url)) in enumerate(zip(embeddings, chunks)):
        points.append(
            models.PointStruct(
                id=point_id,
                vector=vector.tolist(),
                payload={"text": text, "url": source_url},
            )
        )
    client.upsert(collection_name=collection_name, points=points)

def store_in_qdrant_with_metadata(chunks):
    """Embed ``chunks`` and store them in a freshly recreated collection.

    Args:
        chunks: Sequence of ``(text, url)`` tuples.

    The collection is recreated via ``create_collection_if_not_exists``
    before the texts are embedded in batches of 32 and upserted together
    with their metadata.
    """
    # Ask the loaded model for its vector size so that changing model_path
    # cannot leave the collection with a mismatched dimension. Fall back to
    # 384 (the all-MiniLM-L6-v2 size used so far) if the model reports none.
    dimension = embedding_model.get_sentence_embedding_dimension() or 384
    create_collection_if_not_exists(dimension)

    chunk_texts = [text for text, _ in chunks]
    embeddings = embedding_model.encode(chunk_texts, batch_size=32, show_progress_bar=True)
    upsert_points_with_metadata(embeddings, chunks)

def search_points_with_metadata(query_text, k=3):
    """Return the payloads of the ``k`` points closest to ``query_text``.

    The query is embedded with ``get_embedding`` and searched in the
    collection named by ``collection_name``. Each result is a dict with
    the stored ``"text"`` and ``"url"`` of one hit, in the order the
    search returned them.
    """
    query_vector = get_embedding(query_text).tolist()
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=k,
        with_payload=True,
    )

    results = []
    for hit in hits:
        stored = hit.payload
        results.append({"text": stored["text"], "url": stored["url"]})
    return results

def ask(query, k=3, p=False):
    """Answer ``query`` with retrieved context and a streamed Ollama completion.

    The ``k`` best-matching chunks are fetched with
    ``search_points_with_metadata``, wrapped in a ``<CONTEXT>`` block and
    posted to ``ollama_url_gen``. The streamed answer is rendered
    incrementally as Markdown in the notebook output.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve as context.
        p: When true, print the assembled prompt and also prepend it to the
            rendered and returned text.

    Returns:
        The accumulated text (prompt first when ``p`` is true). When the
        request does not return HTTP 200, the failure is printed and the
        text collected so far is returned.
    """
    retrieved_docs = search_points_with_metadata(query, k)

    combined_docs = "\n\n".join(f"Source: {doc['url']}\n{doc['text']}" for doc in retrieved_docs)
    inst = "Instruction: If you do not find the answer in the CONTEXT, just say you don't know."
    rag_prompt = f"{inst}\n\n<CONTEXT>\n{combined_docs}\n</CONTEXT>\n\nQuery: {query}\n"
    if p:
        print(rag_prompt)

    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": True}
    headers = {"Content-Type": "application/json"}

    response_text = rag_prompt if p else ""
    buffer = ""

    # `with` releases the streamed connection even if rendering fails midway.
    with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload), stream=True) as response:
        if response.status_code != 200:
            print("Request failed:", response.status_code, response.text)
            return response_text

        # The stream is one JSON object per line. Iterating by line (rather
        # than by raw network chunk) keeps objects and multi-byte characters
        # intact, so no part of the answer is dropped.
        for line in response.iter_lines():
            if not line:
                continue  # blank keep-alive line
            try:
                data = json.loads(line.decode('utf-8'))
            except json.JSONDecodeError:
                # Skip a malformed line rather than abort the whole answer.
                continue

            buffer += data.get("response", "")

            # Re-render only every ~10 characters to limit output flicker.
            if len(buffer) > 10:
                response_text += buffer
                clear_output(wait=True)
                display(Markdown(response_text))
                buffer = ""

    response_text += buffer
    clear_output(wait=True)
    display(Markdown(response_text))

    return response_text

# Index the chunks scraped in the previous cell. Any failure (e.g. from the
# Qdrant client or the embedding step) is caught and printed so the cell
# finishes instead of raising.
try:
    store_in_qdrant_with_metadata(scraped_chunks)
    print(f'Stored {len(scraped_chunks)} chunks')
except Exception as e:
    print(f"Error storing in Qdrant: {e}")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Stored 13 chunks


In [57]:
# The return value is discarded; ask() renders the answer itself.
# p=False (also the default) keeps the assembled prompt out of the output.
_ = ask("Bangladesh Liberation War data?", p=False)

The death range for the Bangladesh Liberation War is listed as 0.3-3 million.

In [58]:
# NOTE(review): the query wording is ungrammatical; it is runtime text that
# produced the recorded answer below, so it is left unchanged.
_ = ask("When was Federal War happened?")

I don't know when the Federal War happened. It's not in the provided CONTEXT.

In [59]:
# Uses ask()'s defaults: k=3 retrieved chunks, prompt not printed.
_ = ask("How many died in Congo Crisis?")

I don't know. The CONTEXT doesn't mention the Congo Crisis, also known as the Second Congolese War or the Great War of Africa, by name. However it does list Second Congo War which had death toll 3–5.4 million

In [60]:
# NOTE(review): "happend" is a typo in the query; it is runtime text that
# produced the recorded answer below, so it is left unchanged.
_ = ask("Where did Second Congo War happend?")

The Second Congo War took place in the Democratic Republic of the Congo.

In [61]:
# Question about the page's prose rather than its tables; defaults k=3, p=False.
_ = ask("What types of killings are excluded in the list?")

According to the CONTEXT, the list excludes:

* Mass killings
* Atrocities not explicitly classified as genocides
* Genocides occurring outside of wartime
* Human sacrifices
* Ethnic cleansing operations
* Acts of state terrorism or political repression during peacetime.

In [62]:
# Asks for two table rows at once; only the default k=3 chunks are supplied
# as context.
_ = ask("Show the table data for Arab-Israeli conflict and Lebanese Civil War.")

Here is the table data for Arab-Israeli conflict and Lebanese Civil War from the CONTEXT:

| War | Death range | Date | Combatants | Location |
| --- | --- | --- | --- | --- |
| Arab-Israeli conflict | 0.15 million[207][208][209][210] | 1948[g]–present | Israel vs. Arab League, Iran, Hezbollah, Hamas, and the Houthi movement | Levant |
| Lebanese Civil War | 0.12–0.15 million[212][213][214] | 1975–1990 | Multiple sides | Levant |

Let me know if you'd like any further assistance!

In [63]:
# The query embeds a quoted phrase inside double-quoted Python text; the
# inner single quotes are part of the query sent to ask().
_ = ask("this 'list excludes mass killings and atrocities' of what types?")

According to the CONTEXT, the list excludes:

* Mass killings
* Atrocities not explicitly classified as genocides
* Genocides occurring outside of wartime
* Human sacrifices
* Ethnic cleansing operations
* Acts of state terrorism or political repression during peacetime.