In [None]:
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm


def cosine(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (||a|| * ||b||)``.

    When either vector has zero norm the similarity is undefined; ``0.0`` is
    returned in that case instead of dividing by zero (which would yield
    ``nan`` and a RuntimeWarning).
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom


def split_sentences(text):
    """Split ``text`` into a list of non-empty, whitespace-trimmed sentences.

    A boundary is either a run of whitespace that directly follows ``.``,
    ``!`` or ``?``, or a line break. Empty fragments are discarded.
    """
    # Collapse runs of consecutive newlines into a single newline.
    collapsed = re.sub(r'\n+', '\n', text.strip())

    sentences = []
    for piece in re.split(r'(?<=[.!?])\s+|\n', collapsed):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

# Embedding-model configuration shared by the cells below.
# NOTE(review): the device is hard-coded to "cuda"; presumably a GPU runtime is
# expected — confirm before running on a CPU-only machine.
device="cuda"
# Load the BAAI/bge-m3 sentence-embedding model on the configured device.
model = SentenceTransformer("BAAI/bge-m3", device=device)
# Similarity cut-off: the selection loop keeps a chunk only when its score is
# strictly greater than this value.
threshold = 0.40


In [None]:
def chunk_sliding_window(
    sentences,
    window_size=5,
    stride=2
):
    """Group ``sentences`` into overlapping chunks with a sliding window.

    Parameters
    ----------
    sentences : sequence of str
        Sentences in document order.
    window_size : int
        Maximum number of sentences per chunk (must be >= 1).
    stride : int
        Number of sentences the window advances between chunks (must be >= 1).

    Returns
    -------
    list of str
        Each chunk is the window's sentences joined by a single space.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is smaller than 1.

    Notes
    -----
    A trailing one-sentence window is dropped only when that sentence already
    appears in the previous chunk. A one-sentence input, or a tail sentence
    left uncovered because ``stride >= window_size``, is therefore kept
    instead of being silently lost.
    """
    if window_size < 1:
        raise ValueError("window_size must be >= 1")
    if stride < 1:
        raise ValueError("stride must be >= 1")

    chunks = []
    n = len(sentences)
    # Exclusive end index of the sentences already included in some chunk.
    covered_until = 0

    for start in range(0, n, stride):
        window = sentences[start:start + window_size]
        # A single-sentence window that the previous chunk already contains
        # adds no information; nothing useful follows it either.
        if len(window) < 2 and start < covered_until:
            break
        chunks.append(" ".join(window))
        covered_until = start + len(window)

    return chunks

In [None]:
import json

# Read the evaluation queries. The encoding is given explicitly so non-ASCII
# text is decoded the same way on every platform, matching the other file
# accesses in this notebook.
with open('test_web.json', 'r', encoding='utf-8') as f:
    val_web = json.load(f)

# One record per query: the question plus its 'info' text split into pages on
# blank lines.
val_web_pages = [
    {
        'question': val_web_query['question'],
        'pages': val_web_query['info'].split('\n\n'),
    }
    for val_web_query in val_web
]

In [None]:
from tqdm import tqdm
import numpy as np

def cosine(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The dot product is divided by the product of the vector norms, so the
    result is a true cosine even when the inputs are not unit length. For
    unit-length inputs this equals the plain dot product up to floating-point
    rounding. A zero-norm input yields ``0.0`` rather than a division by zero.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# One entry per question, holding the retained chunks of each of its pages.
val_selected_chunks = []

# Running counters reported after the loop.
total_chunks = 0
selected_chunks_cnt = 0

for val_web_page in tqdm(val_web_pages, desc="Web pages"):
    query = val_web_page['question']

    # Embed the question once per query; both sides are encoded with
    # normalize_embeddings=True before the dot product is taken.
    query_embedding = model.encode(query, normalize_embeddings=True)

    chunks_per_page = []

    for page in val_web_page['pages']:
        chunks = chunk_sliding_window(
            split_sentences(page),
            window_size=5,
            stride=2,
        )
        total_chunks += len(chunks)

        # Keep an empty placeholder so the list stays aligned with the pages.
        if len(chunks) == 0:
            chunks_per_page.append([])
            continue

        chunk_embeddings = model.encode(chunks, normalize_embeddings=True)
        sims = np.dot(chunk_embeddings, query_embedding)

        # Retain only the chunks scoring strictly above the threshold.
        kept = [text for text, score in zip(chunks, sims) if score > threshold]
        selected_chunks_cnt += len(kept)
        chunks_per_page.append(kept)

    val_selected_chunks.append({
        'question': query,
        'chunks_page': chunks_per_page,
    })



In [None]:
# Report how many chunks passed the similarity threshold versus how many were generated.
print(selected_chunks_cnt, total_chunks)

In [None]:
# Persist the per-question selections as indented JSON, writing non-ASCII
# characters as-is rather than as \u escapes.
with open("test_selected_chunks.json", "w", encoding="utf-8") as out_file:
    json.dump(val_selected_chunks, out_file, ensure_ascii=False, indent=2)

In [None]:
!pip install faiss-cpu

In [None]:
# Download a file with the gdown CLI; the argument is presumably a Google Drive
# file ID (likely the embeddings JSON indexed further down — TODO confirm).
!gdown 11eVbxCq8N-m2VrK5l2wLQNHetf8vCIoK

In [None]:
# Show the human-readable size of every entry in the current directory
# (presumably a sanity check after the download).
!du -sh ./*


In [None]:
import json
import faiss
import numpy as np

import numpy as np
import faiss


def safe_normalize(embeddings, tol=1e-3):
    """
    L2-normalize ``embeddings`` only if at least one row is not unit length.

    embeddings: np.ndarray (N, d)
    tol: allowed deviation of a row norm from 1

    Returns the ``embeddings`` object that was passed in. When normalization
    is needed it is delegated to ``faiss.normalize_L2``, called on that same
    object.
    """
    row_norms = np.linalg.norm(embeddings, axis=1)
    deviation = np.abs(row_norms - 1.0)

    # Every row is within tolerance of unit length: leave the data untouched.
    if not np.any(deviation > tol):
        print("‚úÖ Embeddings already normalized ‚Üí skip.")
        return embeddings

    # At least one row is off: renormalize the whole matrix.
    faiss.normalize_L2(embeddings)
    print("üîÑ Embeddings were NOT normalized ‚Üí normalized now.")
    return embeddings


def save_index_streaming(
    input_json_path,
    index_out_path,
    batch_size=4096,
):
    """Build a FAISS inner-product index from a JSON file and write it to disk.

    Parameters
    ----------
    input_json_path : str
        UTF-8 JSON file holding either a list of objects or a single object;
        each object must have an ``"embeddings"`` key with one vector.
    index_out_path : str
        Destination path passed to ``faiss.write_index``.
    batch_size : int
        Number of vectors converted to float32 and added to the index at once.

    Raises
    ------
    ValueError
        If the JSON file contains no records.

    Notes
    -----
    Despite the name, the whole JSON document is loaded into memory; only the
    insertion into the index is batched. The vector dimension is taken from
    the first record.

    NOTE(review): vectors are added as stored. ``IndexFlatIP`` ranks by inner
    product, so this presumably relies on the embeddings already being
    L2-normalized — confirm.
    """
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A single record stored as a bare object is treated as a one-item list.
    if isinstance(data, dict):
        data = [data]

    if not data:
        raise ValueError(f"No records found in {input_json_path}")

    dim = len(data[0]["embeddings"])
    index = faiss.IndexFlatIP(dim)

    # Progress is reported once every ten full batches.
    log_every = batch_size * 10
    buf = []

    for item in data:
        buf.append(item["embeddings"])

        if len(buf) == batch_size:
            index.add(np.array(buf, dtype="float32"))
            buf.clear()

            # Test the running total, not the loop index: at flush time the
            # 0-based index is never a multiple of the batch size.
            if index.ntotal % log_every == 0:
                print(f"Added {index.ntotal} vectors")

    # Add the remaining vectors that did not fill a whole batch.
    if buf:
        index.add(np.array(buf, dtype="float32"))

    faiss.write_index(index, index_out_path)
    print(f"Saved index: {index.ntotal} vectors")

# Build the index from the embedded web chunks and write it to embedded_chunks_web.index.
save_index_streaming('embedded_chunks_web.json', 'embedded_chunks_web.index')