In [9]:
# -------------------- imports & setup --------------------
import os, re, json, time, math, statistics
from typing import List, Dict, Tuple
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from tqdm import tqdm

# sentence/paragraph + similarity merges
# import nltk
# try:
#     nltk.data.find("tokenizers/punkt")
# except LookupError:
#     nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize

from transformers import AutoTokenizer
import pandas as pd

# light-weight similarity for semantic merges
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Gemini (LLM-based chunking)
import google.generativeai as genai

# Playwright (for structural HTML fallback on SPA)
from playwright.sync_api import sync_playwright

In [10]:
# ----------------------------
# Setup
# ----------------------------
from pathlib import Path

DATA = Path(r"C:\\Users\\harsh\\OneDrive\\Desktop\\LLM Assignment 2\\Chunking")
SECTIONS_JSON = Path(r"C:\\Users\\harsh\\OneDrive\\Desktop\\LLM Assignment 2\\Scraping\\data\\jiopay_sections.json")
OUT_DIR = Path(r"C:\\Users\\harsh\\OneDrive\\Desktop\\LLM Assignment 2\\Chunking\\chunks")
ABLATION_CSV = Path(r"C:\\Users\\harsh\\OneDrive\\Desktop\\LLM Assignment 2\\Chunking\\chunking_ablation.csv")

load_dotenv()
# print(os.getenv("GEMINI_API_KEY"))
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [11]:
def tok_count(text: str) -> int:
    return len(tokenizer.encode(text or "", add_special_tokens=False))

# -------------------- structural HTML fetch --------------------
def _looks_like_spa_shell(html: str) -> bool:
    if not html: return True
    text = " ".join(BeautifulSoup(html, "lxml").stripped_strings)[:400].lower()
    return ("enable javascript" in text) or (html.count("<h1") + html.count("<h2") + html.count("<h3") < 1)

def fetch_html_via_requests(url: str) -> str:
    try:
        r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=20)
        r.raise_for_status()
        return r.text
    except Exception:
        return ""

_pw = None  # cache a browser across calls
def fetch_html_via_playwright(url: str) -> str:
    global _pw
    try:
        if _pw is None:
            _pw = sync_playwright().start()
            _pw.browser = _pw.chromium.launch(headless=True)
            _pw.page = _pw.browser.new_page(user_agent="Mozilla/5.0")
        _pw.page.goto(url, wait_until="networkidle", timeout=60000)
        # gentle scroll to trigger lazy content
        try:
            _pw.page.evaluate("""async () => {
                let h=document.body.scrollHeight, y=0;
                while (y<h){ y+=Math.max(300, Math.floor(window.innerHeight*0.9));
                  window.scrollTo(0,y); await new Promise(r=>setTimeout(r,80)); h=document.body.scrollHeight;}
            }""")
        except: pass
        return _pw.page.content()
    except Exception:
        return ""

def fetch_structural_html(url: str) -> str:
    html = fetch_html_via_requests(url)
    if _looks_like_spa_shell(html):
        html = fetch_html_via_playwright(url)
    return html or ""

# -------------------- structural chunking (preserve hierarchy) --------------------
def structural_chunks_from_html(html: str) -> List[str]:
    """
    Build chunks by heading hierarchy. We preserve H1/H2/H3 context.
    Each chunk = "H1 > H2 > H3\nparagraph block"
    """
    if not html: return []
    soup = BeautifulSoup(html, "lxml")
    for el in soup(["script","style","noscript","svg","header","footer","nav"]):
        el.decompose()

    elems = soup.find_all(["h1","h2","h3","p","li"], recursive=True)
    h = {1: None, 2: None, 3: None}
    buf, chunks = [], []

    def flush():
        nonlocal buf
        if buf:
            heading_path = " > ".join([x for x in [h[1], h[2], h[3]] if x])
            prefix = (heading_path + "\n") if heading_path else ""
            chunks.append(prefix + " ".join(buf))
            buf = []

    for el in elems:
        txt = el.get_text(" ", strip=True)
        if not txt: continue
        if el.name in ("h1","h2","h3"):
            flush()
            lvl = int(el.name[1])
            h[lvl] = txt
            for k in range(lvl+1,4): h[k]=None
        else:
            buf.append(txt)
    flush()
    # drop tiny whitespace-only chunks
    chunks = [c.strip() for c in chunks if tok_count(c.strip()) > 0]
    return chunks

# -------------------- fixed chunking --------------------
FIXED_SIZES = [256, 512, 1024]
FIXED_OVERLAPS = [0, 64, 128]

def fixed_chunks(text: str, size: int, overlap: int) -> List[str]:
    ids = tokenizer.encode(text or "", add_special_tokens=False)
    chunks = []
    if not ids: return chunks
    i = 0
    while i < len(ids):
        part = ids[i:i+size]
        chunks.append(tokenizer.decode(part))
        step = max(1, size - overlap)
        i += step
    return chunks

# -------------------- semantic chunking (sentence/paragraph + sim merges) --------------------
def _sentences(text: str) -> List[str]:
    try:
        return sent_tokenize(text)
    except LookupError:
        # fallback: crude regex
        return re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)

def _merge_by_similarity(units: List[str], target_tokens=512, sim_threshold=0.25) -> List[str]:
    if not units: return []
    chunks, cur = [], []
    vec = TfidfVectorizer(min_df=1, stop_words=None).fit(units)
    for u in units:
        if not cur:
            cur.append(u); continue
        # check size
        if tok_count(" ".join(cur + [u])) <= target_tokens:
            # similarity with last sentence/paragraph
            A = vec.transform([cur[-1]])
            B = vec.transform([u])
            sim = float(cosine_similarity(A, B)[0][0])
            if sim >= sim_threshold:
                cur.append(u)
            else:
                chunks.append(" ".join(cur)); cur=[u]
        else:
            chunks.append(" ".join(cur)); cur=[u]
    if cur: chunks.append(" ".join(cur))
    return chunks

def semantic_sentence_chunks(text: str, target_tokens=512, sim_threshold=0.25) -> List[str]:
    units = _sentences(text or "")
    return _merge_by_similarity(units, target_tokens, sim_threshold)

def semantic_paragraph_chunks(text: str, target_tokens=512, sim_threshold=0.20) -> List[str]:
    # split paragraphs on 2+ newlines as a light heuristic
    paras = [p.strip() for p in re.split(r'\n\s*\n+', text or "") if p.strip()]
    if not paras:
        paras = [text] if text else []
    return _merge_by_similarity(paras, target_tokens, sim_threshold)

# -------------------- recursive chunking --------------------
def recursive_chunks(text: str, max_tokens=512) -> List[str]:
    """structural -> semantic -> fixed"""
    # try structural on text converted to pseudo-HTML? better: just semantic if we only have text
    # Here: we do semantic sentences first; any oversize becomes fixed.
    out = []
    for s in semantic_sentence_chunks(text, target_tokens=max_tokens):
        if tok_count(s) <= max_tokens:
            out.append(s)
        else:
            out.extend(fixed_chunks(s, size=max_tokens, overlap=64))
    return out

# -------------------- LLM-based chunking (Gemini) --------------------
def gemini_chunk(text: str, target_tokens=300, model="gemini-1.5-flash") -> List[str]:
    prompt = f"""You are a professional technical editor.
Split the following text into coherent chunks of about {target_tokens} tokens each.
Each chunk should contain a single topical unit. Return ONLY valid JSON: a list of strings.

TEXT:
{text}
"""
    try:
        resp = genai.GenerativeModel(model).generate_content(prompt)
        out = resp.text.strip()
        if out.startswith("```"):
            out = re.sub(r"^```(json)?", "", out, flags=re.M).strip("` \n")
        parsed = json.loads(out)
        chunks = [c for c in parsed if isinstance(c, str) and c.strip()]
        return chunks
    except Exception as e:
        print("Gemini chunking failed:", e)
        return []

# -------------------- run all strategies & save --------------------
def save_jsonl(path: Path, rows: List[Dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def run_all_chunking():
    docs = json.loads(SECTIONS_JSON.read_text(encoding="utf-8"))
    ablation_rows = []

    # --- 1) FIXED (9 configs) ---
    for size in FIXED_SIZES:
        for overlap in FIXED_OVERLAPS:
            cfg_name = f"fixed_s{size}_o{overlap}"
            rows, t0 = [], time.time()
            for d in tqdm(docs, desc=cfg_name):
                url, section, text = d.get("url",""), d.get("section",""), d.get("text","")
                if not text: continue
                for ch in fixed_chunks(text, size=size, overlap=overlap):
                    rows.append({"strategy":"fixed", "config":cfg_name, "url":url, "section":section,
                                 "text":ch, "tokens":tok_count(ch)})
            elapsed = time.time()-t0
            save_jsonl(OUT_DIR / f"chunks_{cfg_name}.jsonl", rows)
            if rows:
                toks = [r["tokens"] for r in rows]
                ablation_rows.append({
                    "strategy":"fixed", "config":cfg_name,
                    "#chunks": len(rows),
                    "tokens_total": sum(toks),
                    "avg_tokens": round(statistics.mean(toks),2),
                    "std_tokens": round(statistics.pstdev(toks),2) if len(toks)>1 else 0.0,
                    "time_sec": round(elapsed,2),
                    "redundancy_pct": round((1 - max(1, size - overlap)/size)*100, 2) if overlap>0 else 0.0
                })
            else:
                ablation_rows.append({"strategy":"fixed","config":cfg_name,"#chunks":0,"tokens_total":0,
                                      "avg_tokens":0,"std_tokens":0,"time_sec":round(elapsed,2),
                                      "redundancy_pct":0.0})

    # --- 2) SEMANTIC (sentence + paragraph) ---
    for mode in ["sentence","paragraph"]:
        cfg_name = f"semantic_{mode}_t512"
        rows, t0 = [], time.time()
        for d in tqdm(docs, desc=cfg_name):
            url, section, text = d.get("url",""), d.get("section",""), d.get("text","")
            if not text: continue
            if mode == "sentence":
                chunks = semantic_sentence_chunks(text, target_tokens=512, sim_threshold=0.25)
            else:
                chunks = semantic_paragraph_chunks(text, target_tokens=512, sim_threshold=0.20)
            for ch in chunks:
                rows.append({"strategy":"semantic", "config":cfg_name, "url":url, "section":section,
                             "text":ch, "tokens":tok_count(ch)})
        elapsed = time.time()-t0
        save_jsonl(OUT_DIR / f"chunks_{cfg_name}.jsonl", rows)
        toks = [r["tokens"] for r in rows] if rows else []
        ablation_rows.append({
            "strategy":"semantic", "config":cfg_name,
            "#chunks": len(rows), "tokens_total": sum(toks) if toks else 0,
            "avg_tokens": round(statistics.mean(toks),2) if toks else 0,
            "std_tokens": round(statistics.pstdev(toks),2) if len(toks)>1 else 0,
            "time_sec": round(elapsed,2), "redundancy_pct": 0.0
        })

    # --- 3) STRUCTURAL (fetch real HTML; preserve headings) ---
    cfg_name = "structural_html"
    rows, t0 = [], time.time()
    for d in tqdm(docs, desc=cfg_name):
        url, section = d.get("url",""), d.get("section","")
        html = fetch_structural_html(url)
        chunks = structural_chunks_from_html(html)
        for ch in chunks:
            rows.append({"strategy":"structural","config":cfg_name,"url":url,"section":section,
                         "text":ch, "tokens":tok_count(ch)})
    elapsed = time.time()-t0
    save_jsonl(OUT_DIR / f"chunks_{cfg_name}.jsonl", rows)
    toks = [r["tokens"] for r in rows] if rows else []
    ablation_rows.append({
        "strategy":"structural","config":cfg_name,
        "#chunks": len(rows), "tokens_total": sum(toks) if toks else 0,
        "avg_tokens": round(statistics.mean(toks),2) if toks else 0,
        "std_tokens": round(statistics.pstdev(toks),2) if len(toks)>1 else 0,
        "time_sec": round(elapsed,2), "redundancy_pct": 0.0
    })

    # --- 4) RECURSIVE (structural -> semantic -> fixed) ---
    cfg_name = "recursive_t512"
    rows, t0 = [], time.time()
    for d in tqdm(docs, desc=cfg_name):
        url, section = d.get("url",""), d.get("section","")
        # Start from structural blocks; if empty, fallback to the text field
        html = fetch_structural_html(url)
        base_blocks = structural_chunks_from_html(html)
        if not base_blocks:
            base_blocks = [d.get("text","")]
        for block in base_blocks:
            for ch in recursive_chunks(block, max_tokens=512):
                rows.append({"strategy":"recursive","config":cfg_name,"url":url,"section":section,
                             "text":ch, "tokens":tok_count(ch)})
    elapsed = time.time()-t0
    save_jsonl(OUT_DIR / f"chunks_{cfg_name}.jsonl", rows)
    toks = [r["tokens"] for r in rows] if rows else []
    ablation_rows.append({
        "strategy":"recursive","config":cfg_name,
        "#chunks": len(rows), "tokens_total": sum(toks) if toks else 0,
        "avg_tokens": round(statistics.mean(toks),2) if toks else 0,
        "std_tokens": round(statistics.pstdev(toks),2) if len(toks)>1 else 0,
        "time_sec": round(elapsed,2), "redundancy_pct": 0.0
    })

    # --- 5) LLM-BASED (Gemini) ---
    cfg_name = "llm_gemini_flash_t300"
    rows, t0 = [], time.time()
    approx_in_tokens = 0; approx_out_tokens = 0
    for d in tqdm(docs, desc=cfg_name):
        url, section, text = d.get("url",""), d.get("section",""), d.get("text","")
        if not text: continue
        # keep prompt sizes manageable
        clip = text if len(text) < 12000 else text[:12000]
        approx_in_tokens += math.ceil(len(clip)/4)
        chunks = gemini_chunk(clip, target_tokens=300, model="gemini-1.5-flash")
        for ch in chunks:
            rows.append({"strategy":"llm","config":cfg_name,"url":url,"section":section,
                         "text":ch, "tokens":tok_count(ch)})
            approx_out_tokens += math.ceil(len(ch)/4)
    elapsed = time.time()-t0
    save_jsonl(OUT_DIR / f"chunks_{cfg_name}.jsonl", rows)
    toks = [r["tokens"] for r in rows] if rows else []
    ablation_rows.append({
        "strategy":"llm","config":cfg_name,
        "#chunks": len(rows), "tokens_total": sum(toks) if toks else 0,
        "avg_tokens": round(statistics.mean(toks),2) if toks else 0,
        "std_tokens": round(statistics.pstdev(toks),2) if len(toks)>1 else 0,
        "time_sec": round(elapsed,2),
        "redundancy_pct": 0.0,
        "approx_in_tokens": approx_in_tokens,
        "approx_out_tokens": approx_out_tokens,
        "model":"gemini-1.5-flash"
    })

    # Write ablation table
    pd.DataFrame(ablation_rows).to_csv(ABLATION_CSV, index=False)
    print("Saved:", ABLATION_CSV)

In [12]:
# -------------------- run --------------------
run_all_chunking()

# Cleanup Playwright if opened
try:
    if _pw is not None:
        _pw.page.close()
        _pw.browser.close()
        _pw.stop()
except: pass

fixed_s256_o0:   0%|          | 0/16 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
fixed_s256_o0: 100%|██████████| 16/16 [00:00<00:00, 48.83it/s]
fixed_s256_o64: 100%|██████████| 16/16 [00:00<00:00, 48.84it/s]
fixed_s256_o128: 100%|██████████| 16/16 [00:00<00:00, 33.90it/s]
fixed_s512_o0: 100%|██████████| 16/16 [00:00<00:00, 53.30it/s]
fixed_s512_o64: 100%|██████████| 16/16 [00:00<00:00, 49.78it/s]
fixed_s512_o128: 100%|██████████| 16/16 [00:00<00:00, 42.64it/s] 
fixed_s1024_o0: 100%|██████████| 16/16 [00:00<00:00, 54.52it/s]
fixed_s1024_o64: 100%|██████████| 16/16 [00:00<00:00, 48.54it/s] 
fixed_s1024_o128: 100%|██████████| 16/16 [00:00<00:00, 53.08it/s]
semantic_sentence_t512: 100%|██████████| 16/16 [00:01<00:00, 14.57it/s]
semantic_paragraph_t512: 100%|██████████| 16/16 [00:00<00:00, 97.24it/s]
structural_html: 100%|██████████| 16/16

Gemini chunking failed: Extra data: line 6 column 1 (char 11927)
Saved: C:\Users\harsh\OneDrive\Desktop\LLM Assignment 2\Chunking\chunking_ablation.csv





In [15]:
# === Extended Chunking Evaluation (Weighted Scoring) ===
# Produces: data/chunking_ablation_weighted.csv and prints a ranked table.

import json, re, math
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

# ---------- Helpers ----------
def _load_chunks(pattern: str):
    """Load all chunks_*.jsonl files that match a simple substring pattern on filename."""
    rows = []
    for p in OUT_DIR.glob(pattern):
        with p.open("r", encoding="utf-8") as f:
            for line in f:
                try:
                    row = json.loads(line)
                    # expected fields: strategy, config, url, section, text, tokens
                    rows.append(row)
                except:
                    pass
    return rows

def _tokenize_lower(s: str):
    return re.findall(r"[A-Za-z0-9]+", (s or "").lower())

def _tfidf_matrix(texts):
    # light TF-IDF (no stopwords to keep domain terms)
    vec = TfidfVectorizer(min_df=1)
    X = vec.fit_transform(texts)
    return X, vec

def _normalize_01(series):
    # avoid divide by zero
    if isinstance(series, list):
        series = np.array(series, dtype=float)
    mn, mx = float(np.min(series)), float(np.max(series))
    if mx - mn < 1e-9:
        return np.ones_like(series) * 1.0
    return (series - mn) / (mx - mn)

# ---------- Load base corpus (for completeness) ----------
docs = json.loads(SECTIONS_JSON.read_text(encoding="utf-8"))
# map url -> token set for coverage measure
url_tokens = {}
for d in docs:
    url = d.get("url","")
    text = d.get("text","")
    url_tokens[url] = set(_tokenize_lower(text))

# ---------- Domain keyword list (adjustable) ----------
DOMAIN_TERMS = set("""
jio jiopay business merchant settlement refunds dispute chargeback kyc aml onboarding grievance
billpay biller pos "point of sale" upi "upi hub" payment gateway checkout intent qr "soundbox" "biller centre"
privacy policy terms conditions complaint resolution investor relations help center faq invoice reconciliation
""".replace('"','').split())

# ---------- Load your original ablation (counts/size/time) ----------
abl = pd.read_csv(ABLATION_CSV)

# ---------- Build strategy->chunks map ----------
# We detect available strategies/configs by scanning OUT_DIR files
all_files = list(OUT_DIR.glob("chunks_*.jsonl"))
if not all_files:
    raise SystemExit("No chunk files found in OUT_DIR. Run chunking first.")

# Load all chunks grouped by (strategy, config)
grouped = defaultdict(list)
for fp in all_files:
    # filename: chunks_<config>.jsonl (we stored strategy inside rows)
    with fp.open("r", encoding="utf-8") as f:
        for line in f:
            try:
                r = json.loads(line)
            except:
                continue
            strat = r.get("strategy","unknown")
            cfg = r.get("config","unknown")
            grouped[(strat, cfg)].append(r)

# ---------- Metrics per (strategy, config) ----------
rows = []
for (strategy, config), chunks in grouped.items():
    if not chunks:
        continue

    # --- Size-band Fit (300-600 tokens) ---
    toks = np.array([c.get("tokens",0) for c in chunks], dtype=int)
    size_band_pct = float(( (toks >= 300) & (toks <= 600) ).sum()) / max(1, len(toks))

    # --- Info Density: domain keyword hits per 100 tokens ---
    dens_vals = []
    for c in chunks:
        T = c.get("text","")
        toks_c = _tokenize_lower(T)
        if not toks_c:
            continue
        hits = sum(1 for t in toks_c if t in DOMAIN_TERMS)
        dens = 100.0 * hits / max(1, len(toks_c))
        dens_vals.append(dens)
    info_density = float(np.mean(dens_vals)) if dens_vals else 0.0

    # --- Semantic Coherence (within each URL): mean max-neighbor similarity ---
    # Compute TF-IDF per strategy-config across all its chunks
    texts = [c.get("text","") for c in chunks]
    X, vec = _tfidf_matrix(texts)
    # index chunks by url to compare neighbors within same source doc
    by_url_idx = defaultdict(list)
    for i, c in enumerate(chunks):
        by_url_idx[c.get("url","")].append(i)

    sim_scores = []
    for u, idxs in by_url_idx.items():
        if len(idxs) < 2:
            continue
        Xi = X[idxs]
        sims = cosine_similarity(Xi)
        # for each row, take top-1 neighbor similarity (excluding self)
        for i in range(sims.shape[0]):
            row = sims[i].copy()
            row[i] = -1.0
            sim_scores.append(float(np.max(row)))
    semantic_coherence = float(np.mean(sim_scores)) if sim_scores else 0.0

    # --- Completeness: token coverage vs original url text + continuity penalty ---
    cover_scores = []
    small_or_huge = 0
    for u, idxs in by_url_idx.items():
        # coverage = |union(chunk_tokens)| / |doc_tokens|
        doc_tok = url_tokens.get(u, set())
        if not doc_tok:
            continue
        union = set()
        for i in idxs:
            union |= set(_tokenize_lower(texts[i]))
        cov = float(len(union & doc_tok)) / max(1, len(doc_tok))
        cover_scores.append(cov)

    # continuity penalty: fraction of chunks <80 or >800 tokens
    small_or_huge = float(((toks < 80) | (toks > 800)).sum()) / max(1, len(toks))
    completeness = 0.5 * (np.mean(cover_scores) if cover_scores else 0.0) + 0.5 * (1.0 - small_or_huge)

    # --- Domain grouping quality: cluster vs section label alignment (NMI) ---
    # K = number of unique sections but cap to [2, 12]
    sections = [ (c.get("section") or "").strip() or "NA" for c in chunks ]
    uniq_sections = [s for s, _ in Counter(sections).most_common()]
    K = min(max(len(uniq_sections), 2), 12)
    try:
        km = KMeans(n_clusters=K, n_init="auto", random_state=42)
        labels = km.fit_predict(X)
        nmi = normalized_mutual_info_score(sections, labels)
    except Exception:
        nmi = 0.0

    rows.append({
        "strategy": strategy,
        "config": config,
        "size_band_pct": round(size_band_pct, 4),
        "info_density": round(info_density, 4),
        "semantic_coherence": round(semantic_coherence, 4),
        "completeness": round(completeness, 4),
        "domain_grouping_nmi": round(nmi, 4),
    })

eval_df = pd.DataFrame(rows)

# ---------- Merge with base ablation (for time/chunk counts) ----------
# Expect columns in abl: strategy, config, #chunks, tokens_total, avg_tokens, std_tokens, time_sec, redundancy_pct
merged = abl.merge(eval_df, on=["strategy","config"], how="left")

# ---------- Performance & weighted score ----------
# Throughput = #chunks / time_sec (avoid 0)
merged["throughput"] = merged.apply(lambda r: (r["#chunks"] / r["time_sec"]) if (r.get("time_sec",0)>0) else 0.0, axis=1)

# Normalize metrics to [0,1] per column where higher is better
for col in ["semantic_coherence","completeness","size_band_pct","info_density","domain_grouping_nmi","throughput"]:
    merged[f"{col}_norm"] = _normalize_01(merged[col].fillna(0.0).to_numpy())

# Weighted rubric (like your classmate’s)
# Retrieval Quality (40%) = 20% semantic_coherence + 20% completeness
retrieval_quality = 0.20*merged["semantic_coherence_norm"] + 0.20*merged["completeness_norm"]

# Size Optimization (25%) = size_band_pct
size_optimization = 0.25*merged["size_band_pct_norm"]

# Domain-Specific (25%) = 10% info_density + 15% domain_grouping_nmi
domain_specific = 0.10*merged["info_density_norm"] + 0.15*merged["domain_grouping_nmi_norm"]

# Performance (10%) = throughput
performance = 0.10*merged["throughput_norm"]

merged["weighted_score"] = retrieval_quality + size_optimization + domain_specific + performance
merged = merged.sort_values("weighted_score", ascending=False)

# Save & display
weighted_csv = ABLATION_CSV.parent / "chunking_ablation_weighted.csv"
merged.to_csv(weighted_csv, index=False)

print("Saved weighted evaluation ->", weighted_csv)
display(merged[[
    "strategy","config","#chunks","avg_tokens","time_sec",
    "semantic_coherence","completeness","size_band_pct","info_density","domain_grouping_nmi","throughput",
    "weighted_score"
]].head(20))


Saved weighted evaluation -> C:\Users\harsh\OneDrive\Desktop\LLM Assignment 2\Chunking\chunking_ablation_weighted.csv


Unnamed: 0,strategy,config,#chunks,avg_tokens,time_sec,semantic_coherence,completeness,size_band_pct,info_density,domain_grouping_nmi,throughput,weighted_score
4,fixed,fixed_s512_o64,100,456.44,0.33,0.6297,0.985,0.88,19.1638,0.5119,303.030303,0.737413
3,fixed,fixed_s512_o0,88,459.28,0.3,0.6042,0.9659,0.8864,19.0108,0.5389,293.333333,0.732031
5,fixed,fixed_s512_o128,114,463.24,0.38,0.6802,0.9912,0.8684,16.648,0.5022,300.0,0.721228
10,semantic,semantic_paragraph_t512,16,2525.31,0.17,1.0,0.6875,0.3125,19.2189,0.9003,94.117647,0.624645
2,fixed,fixed_s256_o128,325,242.51,0.48,0.7561,0.9831,0.0,16.187,0.4431,677.083333,0.527702
1,fixed,fixed_s256_o64,217,245.44,0.33,0.6285,0.9885,0.0,15.1522,0.4213,657.575758,0.486846
0,fixed,fixed_s256_o0,168,240.67,0.33,0.5293,0.9762,0.0,17.2019,0.4441,509.090909,0.464757
8,fixed,fixed_s1024_o128,53,846.79,0.3,0.6995,0.6132,0.1698,17.2407,0.6379,176.666667,0.439918
7,fixed,fixed_s1024_o64,49,866.88,0.33,0.7063,0.5918,0.1224,17.1009,0.6823,148.484848,0.426444
6,fixed,fixed_s1024_o0,46,878.46,0.3,0.705,0.6087,0.1304,16.0205,0.644,153.333333,0.415829
