In [6]:
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm
#from transformers import GPT2TokenizerFast
from transformers import AutoTokenizer

import os

# Configure the tokenizers library before any tokenizer object is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Directory holding the all-MiniLM-L6-v2 files. Set MINILM_MODEL_PATH to point
# at a different location; the fallback is the original author's local checkout.
model_path = os.environ.get(
    'MINILM_MODEL_PATH',
    '/Users/hissain/git/github/models/all-MiniLM-L6-v2',
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def init_driver():
    """Create a Chrome WebDriver configured with a fixed set of command-line flags.

    Returns:
        A ``webdriver.Chrome`` instance built from a default ``Service()`` and
        ``ChromeOptions`` carrying ``--headless``, ``--no-sandbox`` and
        ``--disable-dev-shm-usage``.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(service=Service(), options=chrome_options)

URL = "https://en.wikipedia.org/wiki/List_of_wars_by_death_toll"
PAGE_LOAD_TIMEOUT_S = 10  # maximum seconds to wait for the first <table> element

driver = init_driver()
try:
    driver.get(URL)
    # Wait for an actual <table> element instead of sleeping a fixed time:
    # continues as soon as the element is present and raises a
    # TimeoutException if none shows up within the timeout.
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
except Exception:
    # Do not leave an orphaned browser process behind when loading fails.
    driver.quit()
    raise

In [49]:
from transformers import AutoTokenizer

def prepare_table_for_rag_char_count(df, window_len=512):
    """Render every cell of ``df`` as text, cutting long entries by character count.

    Each cell becomes ``"Row <index + 1> | Column <column>: <value>"``. An entry
    longer than ``window_len`` characters is cut into consecutive slices of at
    most ``window_len`` characters; any other entry is kept whole.

    Args:
        df: DataFrame to flatten. The row label is used in ``index + 1``, so it
            has to support adding an integer.
        window_len: Maximum number of characters per chunk.

    Returns:
        list[str]: Chunks in row-major order (all columns of the first row,
        then all columns of the second row, and so on).
    """
    pieces = []
    for row_label, record in df.iterrows():
        for column in df.columns:
            value = record[column]
            entry = f"Row {row_label + 1} | Column {column}: {value}"
            if len(entry) > window_len:
                # Too long for one chunk: emit fixed-width character slices.
                slices = [
                    entry[start:start + window_len]
                    for start in range(0, len(entry), window_len)
                ]
            else:
                slices = [entry]
            pieces += slices
    return pieces

def _pack_words_by_token_budget(text, count_tokens, max_tokens):
    """Greedily pack the whitespace-separated words of ``text`` into pieces of at most ``max_tokens`` tokens."""
    pieces, current = [], ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if current and count_tokens(candidate) > max_tokens:
            pieces.append(current)
            current = word
        else:
            # Also taken for the first word of a piece, so a single word that
            # is over budget on its own is still emitted rather than dropped.
            current = candidate
    if current:
        pieces.append(current)
    return pieces


def prepare_table_for_rag_by_token_count(df, window_len_tokens, priority_cols=None, tokenizer=None):
    """Group table rows into text chunks bounded by a token budget.

    Every row is rendered as ``"col: value | col: value | ..."``. Rows are
    appended to the current chunk until adding the next row would push the
    chunk past ``window_len_tokens``; the chunk is then emitted and a new one
    is started. When the header line ``"Table Headers: ..."`` fits in the
    budget it is placed at the top of every chunk. When it does not fit, it is
    split on word boundaries into its own leading chunks and is not repeated.

    Token counts are ``len(tokenizer.encode(text))`` with the tokenizer's
    default settings, so whatever ``encode`` returns (including any special
    tokens it adds by default) is what gets counted.

    Args:
        df: Source DataFrame.
        window_len_tokens: Maximum number of tokens per chunk; must be >= 1.
        priority_cols: Optional list of column labels; when given, only these
            columns are used, in this order.
        tokenizer: Optional object exposing ``encode(text)`` that returns a
            sized sequence. When omitted, ``bert-base-uncased`` is loaded
            through ``AutoTokenizer`` on every call, so pass an existing
            tokenizer to avoid the repeated load.

    Returns:
        list[str]: The chunks, whitespace-stripped. A single row that is over
        budget on its own is emitted as one chunk and is not split.

    Raises:
        ValueError: If ``window_len_tokens`` is smaller than 1.
    """
    if window_len_tokens < 1:
        raise ValueError("window_len_tokens must be a positive integer")
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    if priority_cols:
        df = df[priority_cols]

    def count_tokens(text):
        return len(tokenizer.encode(text))

    chunks = []
    # str() so that non-string column labels (ints, tuples, ...) do not break join().
    header_text = "Table Headers: " + ", ".join(str(col) for col in df.columns)
    if count_tokens(header_text) > window_len_tokens:
        # The header alone is over budget: split it by tokens (not characters)
        # and do not prepend it to the row chunks.
        chunks.extend(_pack_words_by_token_budget(header_text, count_tokens, window_len_tokens))
        chunk_prefix = ""
    else:
        chunk_prefix = header_text + "\n"
    current_chunk = chunk_prefix

    for _, row in df.iterrows():
        row_str = " | ".join(f"{col}: {val}" for col, val in row.items())

        if count_tokens(current_chunk) + count_tokens(row_str) > window_len_tokens:
            # Skip the flush when nothing has been accumulated yet, otherwise
            # an empty string would end up in the result.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = chunk_prefix + row_str + "\n"
        else:
            current_chunk += row_str + "\n"

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

def prepare_table_for_rag_by_wordcount(df, window_len_words):
    """Group table rows into text chunks bounded by a whitespace-word budget.

    Every row is rendered as ``"col: value | col: value | ..."``. Rows are
    appended to the current chunk until adding the next row would push the
    chunk past ``window_len_words`` words (as counted by ``str.split()``); the
    chunk is then emitted and a new one is started. When the header line
    ``"Table Headers: ..."`` fits in the budget it is placed at the top of
    every chunk. When it does not fit, it is split into leading chunks of at
    most ``window_len_words`` words each and is not repeated.

    Args:
        df: Source DataFrame.
        window_len_words: Maximum number of words per chunk; must be >= 1.

    Returns:
        list[str]: The chunks, whitespace-stripped. A single row that is over
        budget on its own is emitted as one chunk and is not split.

    Raises:
        ValueError: If ``window_len_words`` is smaller than 1.
    """
    if window_len_words < 1:
        raise ValueError("window_len_words must be a positive integer")

    chunks = []
    # str() so that non-string column labels (ints, tuples, ...) do not break join().
    header_text = "Table Headers: " + ", ".join(str(col) for col in df.columns)
    header_words = header_text.split()
    if len(header_words) > window_len_words:
        # Join each window of words separately; joining the windows themselves
        # would hand lists to str.join and raise TypeError.
        chunks.extend(
            " ".join(header_words[start:start + window_len_words])
            for start in range(0, len(header_words), window_len_words)
        )
        # The header does not fit, so it is not prepended to the row chunks.
        chunk_prefix = ""
    else:
        chunk_prefix = header_text + "\n"
    current_chunk = chunk_prefix

    for _, row in df.iterrows():
        row_str = " | ".join(f"{col}: {val}" for col, val in row.items())

        if len(current_chunk.split()) + len(row_str.split()) > window_len_words:
            # Skip the flush when nothing has been accumulated yet, otherwise
            # an empty string would end up in the result.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = chunk_prefix + row_str + "\n"
        else:
            current_chunk += row_str + "\n"

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

In [46]:
from io import StringIO

import pandas as pd  # used below; not imported anywhere else in the notebook

# Parse every <table> element of the page currently loaded in the browser.
html_source = driver.page_source
# read_html is given a file-like object rather than the raw HTML string.
html_file = StringIO(html_source)
tables = pd.read_html(html_file)
# The table at position 1 is the one displayed and chunked in the cells below.
tables[1]

Unnamed: 0,War,Death range,Date,Combatants,Location
0,World War II,50–85 million[4][5][6],1939–1945,Allied Powers vs. Axis Powers,Global
1,Mongol invasions and conquests,20–60 million[7][8][9][10],1207–1405,Mongol Empire vs. various states in Eurasia,Asia and Europe
2,Three Kingdoms,34 million[10],220–280,Multiple sides,China
3,Taiping Rebellion,20–30 million[11][12],1850–1864,Qing Dynasty vs. Taiping Heavenly Kingdom,China
4,World War I,15–30 million[13][14],1914–1918,Allied Powers vs. Central Powers,Global
...,...,...,...,...,...
116,Irish Nine Year's War,0.13 million[219],1593–1603,Kingdom of England vs. Irish rebels,Ireland
117,Chaco War,0.08–0.13 million[220][221][222],1932–1935,Paraguay vs. Bolivia,Paraguay and Bolivia
118,Federal War,0.1 million[223],1859–1863,Federalists vs. Conservatives,Venezuela
119,Congo Crisis,0.1 million[224],1960–1965,"Republic of the Congo, later Democratic Republ...",Republic of the Congo


In [50]:
# One chunk per table cell, cut at 512 characters; show the first two.
chunks = prepare_table_for_rag_char_count(tables[1], 512)
for chunk in chunks[:2]:
    print(chunk)

Row 1 | Column War: World War II
Row 1 | Column Death range: 50–85 million[4][5][6]


In [51]:
# `df` was never defined in this notebook (it only existed as leftover kernel
# state); use the parsed table directly, as the character-count cell does.
chunks2 = prepare_table_for_rag_by_token_count(tables[1], 512)
for chunk in chunks2[:3]:
    print(f"{chunk}\n")



Table Headers: War, Death range, Date, Combatants, Location
War: World War II | Death range: 50–85 million[4][5][6] | Date: 1939–1945 | Combatants: Allied Powers vs. Axis Powers | Location: Global
War: Mongol invasions and conquests | Death range: 20–60 million[7][8][9][10] | Date: 1207–1405 | Combatants: Mongol Empire vs. various states in Eurasia | Location: Asia and Europe
War: Three Kingdoms | Death range: 34 million[10] | Date: 220–280 | Combatants: Multiple sides | Location: China
War: Taiping Rebellion | Death range: 20–30 million[11][12] | Date: 1850–1864 | Combatants: Qing Dynasty vs. Taiping Heavenly Kingdom | Location: China
War: World War I | Death range: 15–30 million[13][14] | Date: 1914–1918 | Combatants: Allied Powers vs. Central Powers | Location: Global
War: Manchu Conquest of China | Death range: 25 million[15][16] | Date: 1618–1683 | Combatants: Manchu vs. Ming Dynasty | Location: China
War: Conquests of Timur | Death range: 7–20 million[10] | Date: 1369–1405 | Comb

In [52]:
# `df` was never defined in this notebook (it only existed as leftover kernel
# state); use the parsed table directly, as the character-count cell does.
chunks3 = prepare_table_for_rag_by_wordcount(tables[1], 384)
for chunk in chunks3[:3]:
    print(f"{chunk}\n")

Table Headers: War, Death range, Date, Combatants, Location
War: World War II | Death range: 50–85 million[4][5][6] | Date: 1939–1945 | Combatants: Allied Powers vs. Axis Powers | Location: Global
War: Mongol invasions and conquests | Death range: 20–60 million[7][8][9][10] | Date: 1207–1405 | Combatants: Mongol Empire vs. various states in Eurasia | Location: Asia and Europe
War: Three Kingdoms | Death range: 34 million[10] | Date: 220–280 | Combatants: Multiple sides | Location: China
War: Taiping Rebellion | Death range: 20–30 million[11][12] | Date: 1850–1864 | Combatants: Qing Dynasty vs. Taiping Heavenly Kingdom | Location: China
War: World War I | Death range: 15–30 million[13][14] | Date: 1914–1918 | Combatants: Allied Powers vs. Central Powers | Location: Global
War: Manchu Conquest of China | Death range: 25 million[15][16] | Date: 1618–1683 | Combatants: Manchu vs. Ming Dynasty | Location: China
War: Conquests of Timur | Death range: 7–20 million[10] | Date: 1369–1405 | Comb