In [37]:
import os
import re
import pickle
import pandas as pd

from io import StringIO
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

#model_path = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
model_path = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_path, force_download=True)

# Define maximum token length per chunk
max_token_length = 200

def clean(text):
    clean_text = re.sub(r'\[\s*\d+\s*\]', '', text)
    return clean_text

def get_text_content(element):
    return ' '.join(str(e) for e in element.stripped_strings)

def chunk_text(text, max_token_length):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_token_length, len(tokens))
        chunk = tokenizer.decode(tokens[start:end])
        chunks.append(chunk)
        start = end
    return chunks

def merge_small_chunks(chunks, max_token_length):
    merged_chunks = []
    temp_chunk = ""
    
    for chunk in chunks:
        if len(tokenizer.encode(temp_chunk + " " + chunk)) <= max_token_length:
            temp_chunk += " " + chunk
        else:
            while len(tokenizer.encode(temp_chunk)) > max_token_length:
                split_point = max_token_length - 1
                merged_chunks.append(tokenizer.decode(tokenizer.encode(temp_chunk)[:split_point]))
                temp_chunk = tokenizer.decode(tokenizer.encode(temp_chunk)[split_point:])
                
            merged_chunks.append(temp_chunk.strip())
            temp_chunk = chunk
    
    if temp_chunk:
        merged_chunks.append(temp_chunk.strip())
    
    return merged_chunks

def chunk_table(df, max_token_length, header_info):
    table_chunks = []
    current_chunk = header_info + ' ||| '
    
    for _, row in df.iterrows():
        row_text = ' | '.join([str(cell) for cell in row if pd.notna(cell)])
        row_text = clean(row_text)
        combined_text = current_chunk + row_text + ' || '
        
        if len(tokenizer.encode(combined_text)) <= max_token_length:
            current_chunk += row_text + ' || '
        else:
            row_chunks = chunk_text(row_text, max_token_length)
            for sub_chunk in row_chunks:
                if len(tokenizer.encode(current_chunk)) + len(tokenizer.encode(sub_chunk)) <= max_token_length:
                    current_chunk += sub_chunk + ' || '
                else:
                    table_chunks.append(current_chunk.strip())
                    current_chunk = header_info + ' ||| ' + sub_chunk + ' || '
                    
    if current_chunk:
        table_chunks.append(current_chunk.strip())
    
    return table_chunks


def scrape_and_chunk_page(content):

    soup = BeautifulSoup(content[1], 'html.parser') # index-1 for html
    
    chunks = []
    current_url = content[0] #index-0 for url
    last_header = ""

    elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'table'])
    for element in elements:
        if element.name in ['h1', 'h2', 'h3', 'h4']:
            header_text = get_text_content(element)
            header_text = clean(header_text)
            last_header = "\nTable (" + header_text + "):"
            header_chunks = chunk_text(header_text, max_token_length)
            chunks.extend([(chunk, current_url) for chunk in header_chunks])
            
        elif element.name == 'p':
            paragraph_text = get_text_content(element)
            paragraph_text = clean(paragraph_text)
            paragraph_chunks = chunk_text(paragraph_text, max_token_length)
            chunks.extend([(chunk, current_url) for chunk in paragraph_chunks])
            
        elif element.name == 'table':
            table_html = StringIO(str(element))
            df = pd.read_html(table_html)[0]
            
            df.dropna(axis=0, how='all', inplace=True)
            df.dropna(axis=1, how='all', inplace=True)
            
            df.columns = [str(col) for col in df.columns]
            header_info = last_header + ' | ' + ' | '.join(df.columns) if not df.columns.empty else last_header
            
            table_chunks = chunk_table(df, max_token_length, header_info)
            chunks.extend([(chunk, current_url) for chunk in table_chunks])

    text_chunks = [chunk[0] for chunk in chunks]
    final_chunks = merge_small_chunks(text_chunks, max_token_length)
    
    return [(chunk, current_url) for chunk in final_chunks]

def scrape_and_chunk(html_contents):
    chunks = []
    for content in tqdm(html_contents, desc="Scraping pages"):
        chunks.extend(scrape_and_chunk_page(content))
    return chunks

with open("html_contents.pkl", "rb") as f:
    html_contents = pickle.load(f)

print(f"Loaded {len(html_contents)} URLs from pickle file")
scraped_chunks = scrape_and_chunk(html_contents)

print(f"Total Chunks: {len(scraped_chunks)}")

for chunk, url in scraped_chunks[:2]:
    print(f"Chunk: {chunk}\nSource URL: {url}\n")



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Loaded 9 URLs from pickle file


Scraping pages:   0%|          | 0/9 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors


Total Chunks: 1131
Chunk: Contents List of wars by death toll Table (List of wars by death toll): | 0 ||| Part of a series on || War (outline) || showHistory || showMilitary || showBattlespace || showWeapons || showTactics || showOperational || showStrategy || showGrand strategy || showAdministrative || showOrganization || showPersonnel || showLogistics || showScience || showLaw || showTheory || showNon-warfare || showCulture || showRelated || hideLists Battles Military occupations Military terms Operations Sieges War crimes Wars Weapons Writers || vte ||
Source URL: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

Chunk: This list of wars by death toll includes all deaths that are either directly or indirectly caused by war. These numbers include the deaths of military personnel which are the direct results of a battle or other military wartime actions, as well as wartime / war - related deaths of civilians which are often results of war - induced epidemics, famines, genocide

In [39]:
import spacy
from collections import Counter
import re
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from IPython.display import display, clear_output, Markdown
import requests
import json
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from rank_bm25 import BM25Okapi

# Load SpaCy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.headers.update({"Connection": "keep-alive", "Content-Type": "application/json"})

qdrant_url = "http://localhost:6333"
collection_name = "wiki_collection"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

client = QdrantClient(url=qdrant_url)

model_path_st = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_path_st)

TOP_K = 20
TOP_N = 3
SYM_W = 0.8
SYN_W = 0.2
NE_BOOST_FACTOR = 1.2
NE_FULL_BOOST_FACTOR = 1.2

def get_embeddings(texts):
    return embedding_model.encode(texts, batch_size=32, show_progress_bar=True)

def create_collection(dimension):
    client.delete_collection(collection_name=collection_name)
    if not client.collection_exists(collection_name=collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=dimension, distance=models.Distance.COSINE),
        )
    
def upsert_points_with_metadata(embeddings, chunks):
    points = [
        models.PointStruct(
            id=i,
            vector=embedding.tolist(),
            payload={"text": chunk, "url": url}
        ) for i, (embedding, (chunk, url)) in enumerate(zip(embeddings, chunks))
    ]
    client.upsert(collection_name=collection_name, points=points)

def store_in_qdrant_with_metadata(chunks):
    dimension = 384
    create_collection(dimension)
    chunk_texts = [chunk for chunk, _ in chunks]
    embeddings = get_embeddings(chunk_texts)
    upsert_points_with_metadata(embeddings, chunks)

def search_points_with_metadata(query_text, k=TOP_K):
    query_embedding = get_embeddings([query_text])[0]
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True
    )
    return [{"text": hit.payload["text"], "url": hit.payload["url"], "score": hit.score} for hit in search_result]

def init_bm25(corpus_texts):
    tokenized_corpus = [text.split() for text in corpus_texts]
    return BM25Okapi(tokenized_corpus)

def calculate_bm25_scores(bm25, query_text):
    tokenized_query = query_text.split()
    return bm25.get_scores(tokenized_query)

def extract_named_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents]

def boost_ne_scores(query_text, docs, bm25_scores, boost_factor=NE_BOOST_FACTOR, full_match_boost=NE_FULL_BOOST_FACTOR):
    query_entities = extract_named_entities(query_text)
    print(f"Query Named Entities: {query_entities}")
    
    boosted_scores = []
    for idx, (doc, bm25_score) in enumerate(zip(docs, bm25_scores)):
        doc_entities = extract_named_entities(doc["text"])
        matching_ne_count = sum(1 for ne in query_entities if ne in doc_entities)
        full_match = all(ne in doc_entities for ne in query_entities)
        ne_boost = 1 + (boost_factor * matching_ne_count)
        if full_match:
            ne_boost *= full_match_boost
        boosted_scores.append(bm25_score * ne_boost)

    print(f"First-4 Boosted scores: {boosted_scores[:4]}")          
    return boosted_scores

#def calculate_boosted_scores(query_text, retrieved_docs, bm25):
#    bm25_scores = calculate_bm25_scores(bm25, query_text)
#    return boost_ne_scores(query_text, retrieved_docs, bm25_scores)

def get_top_n_chunks_by_combined_score(query_text, retrieved_docs, n=TOP_N, semantic_weight=SYM_W, keyword_weight=SYN_W):
    
    bm25 = init_bm25([doc["text"] for doc in retrieved_docs])
    bm25_scores = calculate_bm25_scores(bm25, query_text)
    boosted_keyword_scores = boost_ne_scores(query_text, retrieved_docs, bm25_scores)

    scored_chunks = []
    
    for idx, doc in enumerate(retrieved_docs):
        semantic_score = doc["score"]
        keyword_score = boosted_keyword_scores[idx]
        combined_score = (semantic_weight * semantic_score) + (keyword_weight * keyword_score)
        scored_chunks.append({"text": doc["text"], "url": doc["url"], "combined_score": combined_score})

    scored_chunks.sort(key=lambda n: n["combined_score"], reverse=True)
    print(f"Top-4 Combined scores: {[s['combined_score'] for s in scored_chunks[:4]]}")
    return scored_chunks[:n]

def search_points_with_metadata(query_text, k=TOP_K, n=TOP_N, semantic_weight=SYM_W, keyword_weight=SYN_W):
    query_embedding = get_embeddings([query_text])[0]
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True
    )
    
    retrieved_docs = [{"text": hit.payload["text"], "url": hit.payload["url"], "score": hit.score} for hit in search_result]
    
    return get_top_n_chunks_by_combined_score(query_text, retrieved_docs, n=n, semantic_weight=semantic_weight, keyword_weight=keyword_weight)

def process_streamed_response(response, buffer_size=5):
    response_text, buffer = "", ""
    for chunk in response.iter_content(chunk_size=None):
        try:
            data = json.loads(chunk.decode('utf-8'))
            content = data.get("response", "")
            buffer += content

            if len(buffer) >= buffer_size:
                response_text += buffer
                clear_output(wait=True)
                display(Markdown(response_text))
                buffer = ""
                
        except json.JSONDecodeError:
            continue
            
    response_text += buffer
    clear_output(wait=True)
    display(Markdown(response_text))
    return response_text

def inspect(query, k=TOP_K, n=TOP_N):
    retrieved_docs = search_points_with_metadata(query, k=k, n=n)
    combined_docs = "\n\n".join([f"Source: {doc['url']}\n\n{doc['text']}" for doc in retrieved_docs])
    rag_prompt = f"Documents:\n\n<context>\n\n{combined_docs}\n\n</context>\n\nQuestion: {query}\n\nAnswer:\n"
    print(rag_prompt)

def ask(query, k=TOP_K, n=TOP_N):
    retrieved_docs = search_points_with_metadata(query, k=k, n=n)
    combined_docs = "\n\n".join([f"Source: {doc['url']}\n\n{doc['text']}" for doc in retrieved_docs])
    inst = ("Instruction: Please answer the following question based on following context."
            "If you do not find the answer within the following context, please respond,"
            "'Answer not found in the context.' without speculation or general knowledge."
            "'Do not start with phrase like, 'according to the context', or anything similar.")
    rag_prompt = f"{inst}\n\n<context>\n\n{combined_docs}\n\n</context>\n\nQuestion: {query}\n\nAnswer:\n"
    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": True}
    headers = {"Content-Type": "application/json"}

    response = session.post(ollama_url_gen, headers=headers, data=json.dumps(payload), stream=True)
    response_text = process_streamed_response(response) if response.status_code == 200 else "Request failed"
    return response_text
    
try:
    store_in_qdrant_with_metadata(scraped_chunks)
    print(f'Stored {len(scraped_chunks)} relevant chunks')
except Exception as e:
    print(f"Error storing in Qdrant: {e}")

Batches:   0%|          | 0/36 [00:00<?, ?it/s]

Stored 1131 relevant chunks


In [41]:
def ask(q):
    return inspect(q)

In [42]:
_ = ask("When was Bangladesh Liberation War happened?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['Bangladesh Liberation War']
First-4 Boosted scores: [1.6317178267888255, 4.761380236393816, 0.5702600950934912, 1.1020164986765395]
Top-4 Combined scores: [1.3892509272787632, 0.7634079253577651, 0.7422620260577495, 0.6699134942933664]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

Table (List): | War | Death range | Date | Combatants | Location ||| Afghan conflict | 1. 17 – 3 million | 1978 – present | Multiple sides ; Afghan mujahideen, later Islamic Emirate of Afghanistan, United Tajik Opposition vs. Soviet Union, Democratic Republic of Afghanistan, Northern Alliance, Tajikistan, and the United States - led coalition | Afghanistan, Pakistan and Tajikistan || Delhi Conquest of North India | 0.5–3 million | 1300–1310 | Delhi Sultanate vs. North Indian States | Indian subcontinent || Bangladesh Liberation War | 0.3–3 million | 1971 | India and Provisional Government of Bangladesh vs. Pakistan | Indian subcontinent || Mex

In [50]:
_ = ask("How many died in Bangladesh Liberation War?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['Bangladesh Liberation War']
First-4 Boosted scores: [3.0810359194788495, 0.8790158352985613, 1.0464795611702875, 1.2067768550289126]
Top-4 Combined scores: [1.0969487838957699, 0.6768555630057826, 0.6470319442340575, 0.6434675890139607]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

Table (List): | War | Death range | Date | Combatants | Location ||| Afghan conflict | 1. 17 – 3 million | 1978 – present | Multiple sides ; Afghan mujahideen, later Islamic Emirate of Afghanistan, United Tajik Opposition vs. Soviet Union, Democratic Republic of Afghanistan, Northern Alliance, Tajikistan, and the United States - led coalition | Afghanistan, Pakistan and Tajikistan || Delhi Conquest of North India | 0.5–3 million | 1300–1310 | Delhi Sultanate vs. North Indian States | Indian subcontinent || Bangladesh Liberation War | 0.3–3 million | 1971 | India and Provisional Government of Bangladesh vs. Pakistan | Indian subcontinent || Me

In [51]:
_ = ask("When was Federal War happened?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['Federal War']
First-4 Boosted scores: [0.8928353991247043, 1.0323785226742406, 0.9704249154982434, 0.9097973634870831]
Top-4 Combined scores: [1.4003795161308563, 0.6321273525348482, 0.6281856398249409, 0.6178327430996488]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

Table (List): | War | Death range | Date | Combatants | Location ||| Lebanese Civil War | 0. 12 – 0. 15 million | 1975 – 1990 | Multiple sides | Levant || Greek Civil War | 0.08–0.15 million | 1946–1949 | Kingdom of Greece vs. Provisional Democratic Government | Balkans and Peloponnese Peninsula || Yugoslav Wars | 0.13–0.14 million | 1991–2001 | Separatist forces and NATO vs. Socialist Federal Republic of Yugoslavia, later Federal Republic of Yugoslavia | Balkans || Irish Nine Year's War | 0.13 million | 1593–1603 | Kingdom of England vs. Irish rebels | Ireland || Chaco War | 0.08–0.13 million | 1932–1935 | Paraguay vs. Bolivia | Paraguay and Bolivia || Fe

In [52]:
_ = ask("When did Quasi-War happend?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['Quasi-War']
First-4 Boosted scores: [0.0, 0.0, 0.0, 0.0]
Top-4 Combined scores: [1.629217961652049, 0.36842660800000004, 0.35620008000000003, 0.33735992000000004]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars:_1800%E2%80%931899

Table (1800–1810): | ('Start', 'Start') | ('Finish', 'Finish') | ('Name of conflict', 'Name of conflict') | ('Belligerents', 'Victorious party (if applicable)') | ('Belligerents', 'Defeated party (if applicable)') ||| 1765 | 1865 | Temne War | British Empire Susu Tribes | Kingdom of Koya || 1798 | 1800 | Quasi-War | United States | France || 1801 | 1805 | Tripolitan War | United States  Sweden (until 1802)  Sicily | Tripolitania  Morocco (1802) || 1801 | 1801 | War of the Oranges Part of the War of the Second Coalition | France  Spain | Portugal || 1802 | 1805 | Fourth quarter of the Haitian Revolution | Haiti | France ||

Source: https://en.wikipedia.org/wiki/List_of_wars:_before_1000

Table (References): | h

In [53]:
_ = ask("Where did Second Congo War happend?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['Second Congo War']
First-4 Boosted scores: [2.6707555382629695, 2.4112001210587786, 0.4742173729454025, 1.3371314998052841]
Top-4 Combined scores: [1.0246731556525939, 0.9601797042117557, 0.8288685882184187, 0.7940875843347115]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars:_1990%E2%80%932002

Table (List of wars: 1990–2002): | ('Started', 'Started') | ('Ended', 'Ended') | ('Name of Conflict', 'Name of Conflict') | ('Belligerents', 'Victorious party (if applicable)') | ('Belligerents', 'Defeated party (if applicable)') ||| 1999 | 2003 | Ituri conflict Part of the Second Congo War and the Kivu conflict | Hema tribe : Union of Congolese Patriots ( UPC ) RCD - Kisangani Uganda Democratic Republic of the Congo MONUC Artemis | Lendu tribe : Nationalist and Integrationist Front ( FNI ) Front for Patriotic Resistance in Ituri ( FRPI ) Popular Front for Justice in Congo ( PFJC ) Mai - Mai Simba ||

Source: https://en.wikipedia.org/wiki/List_of

In [54]:
_ = ask("What types of killings are excluded in the list?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: []
First-4 Boosted scores: [6.629371764038861, 1.5088244592636266, 0.9019272447735285, 1.2017340776665637]
Top-4 Combined scores: [1.6407644008077724, 1.1101194390601332, 0.6452742724336044, 0.63233808765913]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

This list of wars by death toll includes all deaths that are either directly or indirectly caused by war. These numbers include the deaths of military personnel which are the direct results of a battle or other military wartime actions, as well as wartime / war - related deaths of civilians which are often results of war - induced epidemics, famines, genocide, etc. Due to incomplete records, the destruction of evidence, differing methods of counting, and various other reasons, death tolls of wars have often been quite uncertain, and heavily debated.

Source: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

While the definition of war isn't entirely clear - cut, t

In [55]:
_ = ask("Which war started in 1945 ended in 1949?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['1945', '1949']
First-4 Boosted scores: [2.027656302049483, 1.6173422983483066, 3.168547074743969, 0.40260934474457977]
Top-4 Combined scores: [2.3037568645429043, 1.425824875578201, 1.069019014948794, 0.8614262204098966]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars:_1945%E2%80%931989

Contents List of wars : 1945 – 1989 This is a list of wars that began between 1945 and 1989. Other wars can be found in the historical lists of wars and the list of wars extended by diplomatic irregularity. Major conflicts of this period include the Chinese Civil War in Asia, the Greek Civil War in Europe, the Colombian civil war known as La Violencia in South America, the Vietnam War in Southeast Asia, the Ethiopian Civil War in Africa, and the Guatemalan Civil War in North America. 1945 – 1949

Source: https://en.wikipedia.org/wiki/List_of_wars:_1945%E2%80%931989

Table (1945–1949): | ('Started', 'Started') | ('Ended', 'Ended') | ('Name of conflict', 

In [56]:
_ = ask("Ethiopian Empire vs. Emirate of Harar?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query Named Entities: ['Ethiopian Empire', 'Emirate of Harar']
First-4 Boosted scores: [1.1074289244774995, 2.2619705287002354, 0.0, 0.9813587378955455]
Top-4 Combined scores: [6.745102110936357, 1.4472712614886873, 1.111249389525159, 1.0317781736989413]
Documents:

<context>

Source: https://en.wikipedia.org/wiki/List_of_wars_by_death_toll

Table (List): | War | Death range | Date | Combatants | Location ||| Conquests of Menelik II | 6 million | 1878 – 1904 | Ethiopian Empire vs. Emirate of Harar, Kingdom of Kaffa, Kingdom of Wolaita, and allies | Horn of Africa || Second Congo War | 3–5.4 million | 1998–2003 | Multiple sides | Democratic Republic of the Congo || Spanish conquest of New Granada | 5.25 million | 1525–1540 | Spanish Empire and Klein-Venedig vs. Muisca Confederation and other civilizations | Colombia || Deccan wars | 4.6–5 million | 1680–1707 | Mughal Empire vs. Maratha Confederacy | Indian subcontinent || Nigerian Civil War | 3.04–4.1 million | 1967–1970 | Nigeria vs. B