In [None]:
import os
import re
import pickle
import pandas as pd

from io import StringIO
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# Set the tokenizers library's parallelism switch to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Alternative local checkpoint path, kept for reference (currently disabled):
#model_path = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
# Checkpoint whose tokenizer is used by the chunking helpers to count tokens
# and to encode/decode chunk text.
# NOTE(review): the embedding model loaded later in this file is
# all-MiniLM-L6-v2, not this checkpoint; token counts may differ between the
# two tokenizers — confirm the chunk budget still fits the embedder's limit.
model_path = "bert-base-cased"
# NOTE(review): force_download=True presumably re-fetches the tokenizer files
# on every run instead of using the local cache — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_path, force_download=True)

# Define maximum token length per chunk
max_token_length = 350

def clean(text):
    """Return *text* with bracketed numeric markers such as ``[12]`` or ``[ 3 ]`` removed."""
    citation_marker = re.compile(r'\[\s*\d+\s*\]')
    return citation_marker.sub('', text)

def get_text_content(element):
    """Join every item of ``element.stripped_strings`` into one space-separated string."""
    fragments = map(str, element.stripped_strings)
    return ' '.join(fragments)

def chunk_text(text, max_token_length):
    """Split *text* into consecutive pieces of at most *max_token_length* tokens.

    The text is encoded with the module-level ``tokenizer`` (no special
    tokens), the token list is sliced into fixed-size windows, and every
    window is decoded back to a string. Because each piece is a decoding of
    token ids, it may not be character-identical to the matching span of the
    input.

    Args:
        text: The string to split.
        max_token_length: Maximum number of tokens per piece; must be > 0.

    Returns:
        A list of decoded strings, empty when *text* yields no tokens.

    Raises:
        ValueError: If *max_token_length* is not positive. Without this check
            the window would never advance and the function would loop forever.
    """
    if max_token_length <= 0:
        raise ValueError("max_token_length must be a positive integer")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(tokens[start:start + max_token_length])
        for start in range(0, len(tokens), max_token_length)
    ]

def merge_small_chunks(chunks, max_token_length):
    """Greedily pack consecutive chunks into pieces of at most *max_token_length* tokens.

    Chunks are appended (space-separated) to a running piece while the result
    still fits. When the next chunk would overflow, the running piece is
    flushed and the chunk starts a new piece. Any piece that is itself longer
    than the limit is split on token boundaries before being emitted; this now
    also applies to the final piece, and empty pieces are never emitted.

    Args:
        chunks: Iterable of text chunks, in document order.
        max_token_length: Token budget per merged piece; must be > 0.

    Returns:
        A list of merged text pieces.

    Raises:
        ValueError: If *max_token_length* is not positive.
    """
    if max_token_length <= 0:
        raise ValueError("max_token_length must be a positive integer")

    def token_ids(text):
        return tokenizer.encode(text, add_special_tokens=False)

    # Oversize text is cut one token short of the limit, as before; the floor
    # of 1 keeps the split loop advancing when max_token_length == 1.
    split_point = max(1, max_token_length - 1)
    merged_chunks = []

    def flush(text):
        """Emit *text*, splitting it first while it exceeds the token budget."""
        if not text.strip():
            return  # nothing accumulated yet (e.g. the very first chunk overflowed)
        ids = token_ids(text)
        while len(ids) > max_token_length:
            merged_chunks.append(tokenizer.decode(ids[:split_point]))
            text = tokenizer.decode(ids[split_point:])
            ids = token_ids(text)
        text = text.strip()
        if text:
            merged_chunks.append(text)

    temp_chunk = ""
    for chunk in chunks:
        candidate = temp_chunk + " " + chunk
        if len(token_ids(candidate)) <= max_token_length:
            temp_chunk = candidate
        else:
            flush(temp_chunk)
            temp_chunk = chunk

    flush(temp_chunk)
    return merged_chunks

def chunk_table(df, max_token_length, header_info):
    """Serialize a DataFrame into text chunks that each start with *header_info*.

    Every chunk has the form ``<header_info> ||| <row> || <row> || ...`` where
    the non-NaN cells of a row are joined with ``' | '`` and passed through
    ``clean``. Rows are added to the current chunk while the token count
    stays within *max_token_length*; a row that does not fit is split with
    ``chunk_text`` and its pieces are placed into the current or a fresh chunk.

    Chunks that would contain only the header (an empty table, or a flush that
    happens before any row was added) are not emitted.

    Token counts here use ``tokenizer.encode`` with its default settings,
    unlike the sibling helpers which pass ``add_special_tokens=False``; this
    is kept as-is so chunk boundaries do not move.
    NOTE(review): a fresh chunk built from header + one row piece is not
    re-checked against the limit, so it can presumably exceed it.

    Args:
        df: Table to serialize (rows are read with ``iterrows``).
        max_token_length: Token budget per chunk.
        header_info: Text repeated at the start of every chunk.

    Returns:
        A list of stripped chunk strings; empty when the table has no rows.
    """
    header_prefix = header_info + ' ||| '
    table_chunks = []
    current_chunk = header_prefix
    has_rows = False  # True once current_chunk holds row data beyond the header

    for _, row in df.iterrows():
        row_text = clean(' | '.join(str(cell) for cell in row if pd.notna(cell)))
        combined_text = current_chunk + row_text + ' || '

        if len(tokenizer.encode(combined_text)) <= max_token_length:
            current_chunk = combined_text
            has_rows = True
            continue

        # The row does not fit next to what is already accumulated: split it
        # and distribute the pieces.
        for sub_chunk in chunk_text(row_text, max_token_length):
            if len(tokenizer.encode(current_chunk)) + len(tokenizer.encode(sub_chunk)) <= max_token_length:
                current_chunk += sub_chunk + ' || '
            else:
                if has_rows:
                    table_chunks.append(current_chunk.strip())
                current_chunk = header_prefix + sub_chunk + ' || '
            has_rows = True

    if has_rows:
        table_chunks.append(current_chunk.strip())

    return table_chunks


def scrape_and_chunk_page(content):
    """Turn one scraped page into token-bounded text chunks tagged with its URL.

    Headings (h1-h4), paragraphs and tables are visited in document order.
    Heading and paragraph text is cleaned and split with ``chunk_text``;
    tables are parsed with pandas and serialized with ``chunk_table``, using
    the most recent heading plus the column names as the repeated header.
    The resulting pieces are finally packed with ``merge_small_chunks``.

    Tables that pandas cannot parse, or that are empty once all-NaN rows and
    columns are dropped, are skipped instead of aborting the page.

    Args:
        content: Indexable pair where index 0 is the page URL and index 1 is
            the page HTML.

    Returns:
        A list of ``(chunk_text, url)`` tuples.
    """
    html = content[1]  # index-1 for html
    current_url = content[0]  # index-0 for url
    soup = BeautifulSoup(html, 'html.parser')

    text_chunks = []
    last_header = ""

    for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'table']):
        if element.name == 'table':
            try:
                df = pd.read_html(StringIO(str(element)))[0]
            except ValueError:
                # read_html raises ValueError when the markup holds no
                # parseable table; skip it rather than fail the whole page.
                continue

            # Drop fully empty rows/columns first, then decide whether
            # anything is left to serialize.
            df = df.dropna(axis=0, how='all').dropna(axis=1, how='all')
            if df.empty:
                continue

            df.columns = [str(col) for col in df.columns]
            header_info = last_header + ' | ' + ' | '.join(df.columns)
            text_chunks.extend(chunk_table(df, max_token_length, header_info))
            continue

        element_text = clean(get_text_content(element))
        if element.name != 'p':
            # Remember the latest heading so following tables can cite it.
            last_header = "\nTable (" + element_text + "):"
        text_chunks.extend(chunk_text(element_text, max_token_length))

    final_chunks = merge_small_chunks(text_chunks, max_token_length)

    return [(chunk, current_url) for chunk in final_chunks]

def scrape_and_chunk(html_contents):
    """Chunk every page in *html_contents* and return all chunks as one flat list.

    Each item is handed to ``scrape_and_chunk_page``; progress is shown with a
    tqdm bar labelled "Scraping pages".
    """
    return [
        page_chunk
        for content in tqdm(html_contents, desc="Scraping pages")
        for page_chunk in scrape_and_chunk_page(content)
    ]

# Load the previously scraped pages. Each item is indexed as [0] = URL and
# [1] = HTML by scrape_and_chunk_page.
# NOTE(security): unpickling can execute arbitrary code; only load
# html_contents.pkl when it comes from a trusted source.
with open("html_contents.pkl", "rb") as f:
    html_contents = pickle.load(f)

print(f"Loaded {len(html_contents)} URLs from pickle file")
# Chunk every page; the result is a flat list of (chunk_text, url) tuples.
scraped_chunks = scrape_and_chunk(html_contents)

print(f"Total Chunks: {len(scraped_chunks)}")

# Preview the first two chunks as a quick sanity check.
for chunk, url in scraped_chunks[:2]:
    print(f"Chunk: {chunk}\nSource URL: {url}\n")

In [None]:
import spacy
from collections import Counter
import re
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from IPython.display import display, clear_output, Markdown
import requests
import json
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from rank_bm25 import BM25Okapi

# Load SpaCy's English model for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

# Shared HTTP session. The retry policy (up to 5 attempts, backoff factor 1,
# for the listed status codes) is mounted for http:// URLs only.
# NOTE(review): urllib3's Retry may not retry POST requests by default —
# confirm whether it actually covers the POST issued by ask().
session = requests.Session()
retry = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.headers.update({"Connection": "keep-alive", "Content-Type": "application/json"})

# Endpoints and names of the local services used below.
qdrant_url = "http://localhost:6333"
collection_name = "wiki_collection"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

client = QdrantClient(url=qdrant_url)

# Sentence-embedding model loaded from a machine-specific local path.
model_path_st = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_path_st)

# Default number of hits requested from Qdrant (parameter k).
TOP_K = 10
# Default number of chunks kept after re-ranking (parameter n).
TOP_N = 4
# Default semantic_weight applied to the vector-search score.
SYM_W = 0.8
# Default keyword_weight applied to the boosted BM25 score.
SYN_W = 0.2
# Default per-matching-entity boost_factor used by boost_ne_scores.
NE_BOOST_FACTOR = 1.2
# Default extra multiplier (full_match_boost) when every query entity matches.
NE_FULL_BOOST_FACTOR = 1.2

def get_embeddings(texts):
    """Encode *texts* with the module-level ``embedding_model``.

    Encoding runs in batches of 32 with the progress bar enabled; the result
    of ``encode`` is returned unchanged.
    """
    encode_options = {"batch_size": 32, "show_progress_bar": True}
    return embedding_model.encode(texts, **encode_options)

def create_collection(dimension):
    """Delete the configured Qdrant collection, then recreate it.

    The new collection stores vectors of size *dimension* compared with
    cosine distance. Any data previously stored under ``collection_name`` is
    removed by the delete call.
    """
    client.delete_collection(collection_name=collection_name)
    vector_settings = models.VectorParams(size=dimension, distance=models.Distance.COSINE)
    client.create_collection(collection_name=collection_name, vectors_config=vector_settings)

def upsert_points_with_metadata(embeddings, chunks, batch_size=100):
    """Upload embeddings with their text/url payloads to Qdrant in batches.

    Args:
        embeddings: Sequence of vectors exposing ``tolist()``; paired with
            *chunks* positionally via ``zip``.
        chunks: Sequence of ``(text, url)`` tuples.
        batch_size: Number of points sent per upsert call.

    Point ids are the positional indices 0..N-1. A 10 ms pause follows each
    batch.
    """
    points = []
    for point_id, (vector, (text, source_url)) in enumerate(zip(embeddings, chunks)):
        point = models.PointStruct(
            id=point_id,
            vector=vector.tolist(),
            payload={"text": text, "url": source_url},
        )
        points.append(point)

    batch_starts = range(0, len(points), batch_size)
    for start in tqdm(batch_starts, desc="Storing to Qdrant"):
        client.upsert(collection_name=collection_name, points=points[start:start + batch_size])
        time.sleep(0.01)

def store_in_qdrant_with_metadata(chunks):
    """Embed *chunks* and store them in a freshly created Qdrant collection.

    Embeddings are computed first, so a failure while encoding does not wipe
    the existing collection. The vector size is taken from the embeddings
    themselves, so it always matches what is upserted; 384 (the previously
    hard-coded size) is used only when there is nothing to embed.

    Args:
        chunks: Sequence of ``(text, url)`` tuples.
    """
    default_dimension = 384

    chunk_texts = [text for text, _ in chunks]
    embeddings = get_embeddings(chunk_texts)
    dimension = len(embeddings[0]) if len(embeddings) else default_dimension

    # create_collection deletes any existing collection, so only call it once
    # the embeddings are known to be available.
    create_collection(dimension)
    upsert_points_with_metadata(embeddings, chunks)

def search_points_with_metadata(query_text, k=TOP_K):
    """Return the *k* nearest stored chunks for *query_text*, without re-ranking.

    NOTE: a later function with the same name is defined further down in this
    file and replaces this definition once that code runs.

    Returns:
        A list of dicts with the keys ``text``, ``url`` and ``score``.
    """
    query_vector = get_embeddings([query_text])[0].tolist()
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=k,
        with_payload=True,
    )
    results = []
    for hit in hits:
        results.append({"text": hit.payload["text"], "url": hit.payload["url"], "score": hit.score})
    return results

def init_bm25(corpus_texts):
    """Build a ``BM25Okapi`` index over *corpus_texts*, tokenized on whitespace."""
    tokenized_corpus = []
    for document in corpus_texts:
        tokenized_corpus.append(document.split())
    return BM25Okapi(tokenized_corpus)

def calculate_bm25_scores(bm25, query_text):
    """Score every indexed document against the whitespace-split *query_text*.

    Returns whatever ``bm25.get_scores`` returns for the query terms.
    """
    query_terms = query_text.split()
    scores = bm25.get_scores(query_terms)
    return scores

def extract_named_entities(text):
    """Run the module-level spaCy pipeline on *text* and return the entity strings."""
    entity_texts = []
    for entity in nlp(text).ents:
        entity_texts.append(entity.text)
    return entity_texts

def boost_ne_scores(query_text, docs, bm25_scores, boost_factor=NE_BOOST_FACTOR, full_match_boost=NE_FULL_BOOST_FACTOR):
    """Scale BM25 scores by how many of the query's named entities each doc contains.

    For a document matching ``m`` query entities the multiplier is
    ``1 + boost_factor * m``, further multiplied by *full_match_boost* when
    every query entity is present. A query without named entities leaves the
    scores unchanged; previously such queries received the full-match
    multiplier on every document because ``all()`` over an empty list is True.

    Args:
        query_text: The user query.
        docs: Retrieved documents; each must provide ``doc["text"]``.
        bm25_scores: BM25 scores aligned positionally with *docs*.
        boost_factor: Multiplier increment per matching entity.
        full_match_boost: Extra multiplier when all query entities match.

    Returns:
        A list of boosted scores aligned with *docs*.
    """
    query_entities = extract_named_entities(query_text)
    print(f"Query Named Entities: {query_entities}")

    boosted_scores = []
    for doc, bm25_score in zip(docs, bm25_scores):
        ne_boost = 1
        if query_entities:
            # NER on the document is only needed when there is something to match.
            doc_entities = set(extract_named_entities(doc["text"]))
            matching_ne_count = sum(1 for ne in query_entities if ne in doc_entities)
            ne_boost = 1 + (boost_factor * matching_ne_count)
            if matching_ne_count == len(query_entities):
                ne_boost *= full_match_boost
        boosted_scores.append(bm25_score * ne_boost)

    print(f"First-4 Boosted scores: {boosted_scores[:4]}")
    return boosted_scores

def get_top_n_chunks_by_combined_score(query_text, retrieved_docs, n=TOP_N, semantic_weight=SYM_W, keyword_weight=SYN_W):
    """Re-rank retrieved docs by a weighted sum of semantic and keyword scores.

    The keyword score is BM25 computed over *retrieved_docs* only, boosted by
    named-entity overlap (``boost_ne_scores``). The combined score is
    ``semantic_weight * doc["score"] + keyword_weight * keyword_score``.

    Args:
        query_text: The user query.
        retrieved_docs: Dicts with ``text``, ``url`` and ``score`` keys.
        n: Number of top chunks to return.
        semantic_weight: Weight of the vector-search score.
        keyword_weight: Weight of the boosted BM25 score.

    Returns:
        Up to *n* dicts with ``text``, ``url`` and ``combined_score``, best
        first. An empty list when nothing was retrieved.
    """
    if not retrieved_docs:
        # A BM25 index cannot be built from an empty corpus (rank_bm25 divides
        # by the corpus size), so there is nothing to rank.
        return []

    bm25 = init_bm25([doc["text"] for doc in retrieved_docs])
    bm25_scores = calculate_bm25_scores(bm25, query_text)
    boosted_keyword_scores = boost_ne_scores(query_text, retrieved_docs, bm25_scores)

    scored_chunks = [
        {
            "text": doc["text"],
            "url": doc["url"],
            "combined_score": (semantic_weight * doc["score"]) + (keyword_weight * keyword_score),
        }
        for doc, keyword_score in zip(retrieved_docs, boosted_keyword_scores)
    ]

    scored_chunks.sort(key=lambda item: item["combined_score"], reverse=True)
    print(f"Top-4 Combined scores: {[s['combined_score'] for s in scored_chunks[:4]]}")
    return scored_chunks[:n]

def search_points_with_metadata(query_text, k=TOP_K, n=TOP_N, semantic_weight=SYM_W, keyword_weight=SYN_W):
    """Retrieve the *k* nearest chunks from Qdrant and return the top *n* after re-ranking.

    The query is embedded, searched against ``collection_name`` with payloads
    included, and the hits are handed to
    ``get_top_n_chunks_by_combined_score`` together with the two weights.
    """
    query_vector = get_embeddings([query_text])[0].tolist()
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=k,
        with_payload=True,
    )

    retrieved_docs = []
    for hit in hits:
        retrieved_docs.append({"text": hit.payload["text"], "url": hit.payload["url"], "score": hit.score})

    return get_top_n_chunks_by_combined_score(
        query_text,
        retrieved_docs,
        n=n,
        semantic_weight=semantic_weight,
        keyword_weight=keyword_weight,
    )

def process_streamed_response(response, buffer_size=5):
    """Render a streamed generation response incrementally and return the full text.

    The body is read line by line and each non-empty line is parsed as one
    JSON object whose ``"response"`` field holds the next text fragment.
    Reading by line (instead of by raw network chunk) keeps a JSON object
    intact even when it is split across chunks or several objects arrive in
    one chunk; previously such chunks failed to parse and their text was
    silently dropped.

    Args:
        response: A streamed ``requests`` response.
        buffer_size: Minimum number of buffered characters before the
            notebook output is refreshed.

    Returns:
        The concatenated response text.
    """
    response_text, buffer = "", ""
    # chunk_size=None keeps reading data as it arrives, as the original did.
    for line in response.iter_lines(chunk_size=None):
        if not line:
            continue  # blank separator lines carry no payload
        try:
            data = json.loads(line)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Best effort: skip a malformed line rather than abort the stream.
            continue

        buffer += data.get("response", "")

        if len(buffer) >= buffer_size:
            response_text += buffer
            clear_output(wait=True)
            display(Markdown(response_text))
            buffer = ""

    # Flush whatever is left below the refresh threshold.
    response_text += buffer
    clear_output(wait=True)
    display(Markdown(response_text))
    return response_text

def inspect(query, k=TOP_K, n=TOP_N):
    """Print the RAG prompt that would be built for *query*, without calling the LLM.

    Retrieves and re-ranks chunks exactly like ``ask`` does, formats them into
    the context block and prints the prompt. Returns ``None``.
    """
    retrieved_docs = search_points_with_metadata(query, k=k, n=n)
    sections = []
    for doc in retrieved_docs:
        sections.append(f"Source: {doc['url']}\n\n{doc['text']}")
    combined_docs = "\n\n".join(sections)
    rag_prompt = f"Documents:\n\n<context>\n\n{combined_docs}\n\n</context>\n\nQuestion: {query}\n\nAnswer:\n"
    print(rag_prompt)

def ask(query, k=TOP_K, n=TOP_N):
    """Answer *query* with the Ollama model, grounded in retrieved chunks.

    Retrieves and re-ranks chunks, builds an instruction + context prompt,
    posts it to ``ollama_url_gen`` with streaming enabled and renders the
    answer as it arrives.

    NOTE: a later one-argument ``ask(q)`` is defined further down in this file
    and replaces this function once that code runs.

    Args:
        query: The user question.
        k: Number of hits requested from the vector store.
        n: Number of chunks kept after re-ranking.

    Returns:
        The generated answer text, or ``"Request failed"`` when the server
        does not answer with HTTP 200.
    """
    retrieved_docs = search_points_with_metadata(query, k=k, n=n)
    combined_docs = "\n\n".join([f"Source: {doc['url']}\n\n{doc['text']}" for doc in retrieved_docs])
    # Adjacent literals are concatenated verbatim, so each sentence needs its
    # own trailing space; the original ran the sentences together and carried
    # a stray quote before "Do not start".
    inst = ("Instruction: Please answer the following question based on following context. "
            "If you do not find the answer within the following context, please respond, "
            "'Answer not found in the context.' without speculation or general knowledge. "
            "Do not start with phrase like, 'according to the context', or anything similar.")
    rag_prompt = f"{inst}\n\n<context>\n\n{combined_docs}\n\n</context>\n\nQuestion: {query}\n\nAnswer:\n"
    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": True}
    headers = {"Content-Type": "application/json"}

    # The context manager releases the streamed connection in every case,
    # including the non-200 path where the body is never read.
    with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload), stream=True) as response:
        if response.status_code != 200:
            return "Request failed"
        return process_streamed_response(response)
    
# Rebuild the Qdrant collection from the scraped chunks. Any exception is
# caught and printed, so later code still runs after a failure here.
try:
    store_in_qdrant_with_metadata(scraped_chunks)
    print(f'Stored {len(scraped_chunks)} relevant chunks')
except Exception as e:
    print(f"Error storing in Qdrant: {e}")

In [None]:
def ask(q):
    """Debug override of ``ask``: print the RAG prompt for *q* instead of querying the LLM.

    Delegates to ``inspect`` and returns its result.
    """
    inspected = inspect(q)
    return inspected

In [None]:
# Demo query: date of a specific war. The result is assigned to `_` and discarded.
_ = ask("When was Bangladesh Liberation War happened?")

In [None]:
# Demo query: death toll of a specific war.
_ = ask("How many died in Bangladesh Liberation War?")

In [None]:
# Demo query: date of the Federal War.
_ = ask("When was Federal War happened?")

In [None]:
# Demo query: date of the Quasi-War.
_ = ask("When did Quasi-War happend?")

In [None]:
# Demo query: location of the Second Congo War.
_ = ask("Where did Second Congo War happend?")

In [None]:
# Demo query: asks about the list's exclusion criteria rather than a single war.
_ = ask("What types of killings are excluded in the list of war by death toll?")

In [None]:
# Demo query: identify a war from its start and end years.
_ = ask("Which war started in 1945 ended in 1949?")

In [None]:
# Demo query: terse "X vs. Y" phrasing naming two belligerents.
_ = ask("Ethiopian Empire vs. Emirate of Harar?")