In [11]:
import re
import pickle
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoTokenizer
import os
from io import StringIO
import logging
from tqdm.notebook import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_path = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define maximum token length per chunk
max_token_length = 480

def get_text_content(element):
    return ' '.join(element.stripped_strings)

def chunk_text(text, max_token_length):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_token_length, len(tokens))
        chunk = tokenizer.decode(tokens[start:end])
        chunks.append(chunk)
        start = end
    return chunks

def merge_small_chunks(chunks, max_token_length):
    merged_chunks = []
    temp_chunk = ""
    
    for chunk in chunks:
        if len(tokenizer.encode(temp_chunk + " " + chunk)) <= max_token_length:
            temp_chunk += " " + chunk
        else:
            while len(tokenizer.encode(temp_chunk)) > max_token_length:
                split_point = max_token_length - 1  # Choose safe split point
                merged_chunks.append(tokenizer.decode(tokenizer.encode(temp_chunk)[:split_point]))
                temp_chunk = tokenizer.decode(tokenizer.encode(temp_chunk)[split_point:])
                
            merged_chunks.append(temp_chunk.strip())
            temp_chunk = chunk
    
    if temp_chunk:
        merged_chunks.append(temp_chunk.strip())
    
    return merged_chunks

def chunk_table(df, max_token_length, header_info):
    table_chunks = []
    current_chunk = header_info + ' ||| '
    
    for _, row in df.iterrows():
        row_text = ' | '.join([str(cell) for cell in row if pd.notna(cell)])
        combined_text = current_chunk + row_text + ' || '
        
        if len(tokenizer.encode(combined_text)) <= max_token_length:
            current_chunk += row_text + ' || '
        else:
            # Split the row if adding it would exceed max_token_length
            row_chunks = chunk_text(row_text, max_token_length)
            for sub_chunk in row_chunks:
                if len(tokenizer.encode(current_chunk)) + len(tokenizer.encode(sub_chunk)) <= max_token_length:
                    current_chunk += sub_chunk + ' || '
                else:
                    table_chunks.append(current_chunk.strip())
                    current_chunk = header_info + ' ||| ' + sub_chunk + ' || '
                    
    if current_chunk:
        table_chunks.append(current_chunk.strip())
    
    return table_chunks


def scrape_and_chunk_page(content):

    soup = BeautifulSoup(content[1], 'html.parser') # index-1 for html
    
    chunks = []
    current_url = content[0] #index-0 for url
    last_header = ""

    elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'table'])
    for element in elements:
        if element.name in ['h1', 'h2', 'h3', 'h4']:
            header_text = get_text_content(element)
            last_header = header_text
            header_chunks = chunk_text(header_text, max_token_length)
            chunks.extend([(chunk, current_url) for chunk in header_chunks])
            
        elif element.name == 'p':
            paragraph_text = get_text_content(element)
            paragraph_chunks = chunk_text(paragraph_text, max_token_length)
            chunks.extend([(chunk, current_url) for chunk in paragraph_chunks])
            
        elif element.name == 'table':
            table_html = StringIO(str(element))
            df = pd.read_html(table_html)[0]
            
            df.dropna(axis=0, how='all', inplace=True)
            df.dropna(axis=1, how='all', inplace=True)
            
            df.columns = [str(col) for col in df.columns]
            header_info = last_header + ' | ' + ' | '.join(df.columns) if not df.columns.empty else last_header
            
            table_chunks = chunk_table(df, max_token_length, header_info)
            chunks.extend([(chunk, current_url) for chunk in table_chunks])

    text_chunks = [chunk[0] for chunk in chunks]
    final_chunks = merge_small_chunks(text_chunks, max_token_length)
    
    return [(chunk, current_url) for chunk in final_chunks]

def scrape_and_chunk(html_contents):
    chunks = []
    for content in tqdm(html_contents, desc="Scraping pages"):
        chunks.extend(scrape_and_chunk_page(content))
    return chunks

with open("html_contents.pkl", "rb") as f:
    html_contents = pickle.load(f)

print(f"Loaded {len(html_contents)} URLs from pickle file")
scraped_chunks = scrape_and_chunk(html_contents)

print(f"Total Chunks: {len(scraped_chunks)}")

for chunk, url in scraped_chunks[:3]:
    print(f"Chunk: {chunk}\nSource URL: {url}\n")

Loaded 9 URLs from pickle file


Scraping pages:   0%|          | 0/9 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


Total Chunks: 299
Chunk: contents list of wars by death toll List of wars by death toll | 0 ||| Part of a series on || War (outline) || showHistory || showMilitary || showBattlespace || showWeapons || showTactics || showOperational || showStrategy || showGrand strategy || showAdministrative || showOrganization || showPersonnel || showLogistics || showScience || showLaw || showTheory || showNon-warfare || showCulture || showRelated || hideLists Battles Military occupations Military terms Operations Sieges War crimes Wars Weapons Writers || vte || this list of wars by death toll includes all deaths that are either directly or indirectly caused by war. these numbers include the deaths of military personnel which are the direct results of a battle or other military wartime actions, as well as wartime / war - related deaths of civilians which are often results of war - induced epidemics, famines, genocide, etc. due to incomplete records, the destruction of evidence, differing methods of cou

In [13]:
with open("scraped_chunks.pkl", "wb") as f:
    pickle.dump(scraped_chunks, f)

In [14]:
with open("scraped_chunks.pkl", "rb") as f:
    data = pickle.load(f)
print(len(data))

299
