In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import json

# ============================
# D·ªØ li·ªáu m·∫´u
# ============================
product_list = [
    {"code": "M001", "name": "7 Up"},
    {"code": "M002", "name": "Sting"},
    {"code": "M003", "name": "Tiger B·∫°c Lon L·ªõn 330ml"},
    {"code": "M004", "name": "KhƒÉn l·∫°nh"},
    {"code": "M005", "name": "H√†u n∆∞·ªõng ph√¥ mai"},
    {"code": "M006", "name": "Ngh√™u h·∫•p s·∫£"},
    {"code": "M007", "name": "B√™ thui (b√≤ t∆°) (thƒÉn, qu·∫ø, b·∫Øp, g√π)"},
    {"code": "M008", "name": "B√°nh tr√°ng n∆∞·ªõng"},
    {"code": "M009", "name": "G·ªèi b√≤ t∆° b√≥p th·∫•u"},
]

input_product = [
    {"no": 1, "name": "7 Up", "unit": "Lon", "quantity": 2, "unit_price": 20000},
    {"no": 2, "name": "Sting", "unit": "Lon", "quantity": 1, "unit_price": 20000},
    {"no": 3, "name": "Tiger B·∫°c Lon L·ªõn 330ml", "unit": "Lon", "quantity": 2, "unit_price": 25000},
    {"no": 4, "name": "KhƒÉn l·∫°nh", "unit": "C√°i", "quantity": 2, "unit_price": 3000},
    {"no": 5, "name": "H√†u n∆∞·ªõng ph√¥ mai", "unit": "Con", "quantity": 3, "unit_price": 37000}
]

# ============================
# Model SentenceTransformer
# ============================
print("ƒêang load model SentenceTransformer...")
model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder')
print("Model loaded!")

texts = [f"{p['code']} {p['name']}" for p in product_list]
embeddings = model.encode(texts, normalize_embeddings=True)
dimension = embeddings.shape[1]

index = faiss.IndexFlatIP(dimension)
index.add(np.array(embeddings).astype('float32'))
print("FAISS index s·∫µn s√†ng!")


# ============================
# H√†m ch√≠nh
# ============================
def rag_products_cosine(product_list_input, top_k=1, threshold=0.6):
    # Reset flag
    for p in product_list:
        p["selected"] = False
        p["unit"] = ""
        p["quantity"] = 0
        p["unit_price"] = 0
        p["similarity"] = 0.0

    # D√≤ t√¨m match
    for item in product_list_input:
        name = item.get("name", "").strip()
        if not name:
            continue

        user_emb = model.encode([name], normalize_embeddings=True)
        D, I = index.search(np.array(user_emb).astype("float32"), k=top_k)

        best_idx = int(I[0][0])
        best_sim = float(D[0][0])
        best_product = product_list[best_idx]

        print(f"\nüîπ OCR: {name}")
        print(f"- Match: {best_product['name']} (similarity={best_sim:.4f})")

        if best_sim >= threshold:
            best_product["selected"] = True
            best_product["unit"] = item.get("unit", "")
            best_product["quantity"] = item.get("quantity", 0)
            best_product["unit_price"] = item.get("unit_price", 0)
            best_product["similarity"] = round(best_sim, 4)

    # Xu·∫•t to√†n b·ªô danh s√°ch product list
    json_output = json.dumps(product_list, ensure_ascii=False, indent=4)
    print("\n Danh s√°ch product_list sau khi match:")
    print(json_output)

    return json_output


# ============================
# G·ªçi h√†m test
# ============================
rag_products_cosine(input_product, threshold=0.5)


ƒêang load model SentenceTransformer...
Model loaded!
FAISS index s·∫µn s√†ng!

üîπ OCR: 7 Up
- Match: 7 Up (similarity=0.6078)

üîπ OCR: Sting
- Match: Sting (similarity=0.2572)

üîπ OCR: Tiger B·∫°c Lon L·ªõn 330ml
- Match: Tiger B·∫°c Lon L·ªõn 330ml (similarity=0.9196)

üîπ OCR: KhƒÉn l·∫°nh
- Match: KhƒÉn l·∫°nh (similarity=0.6415)

üîπ OCR: H√†u n∆∞·ªõng ph√¥ mai
- Match: H√†u n∆∞·ªõng ph√¥ mai (similarity=0.7652)

 Danh s√°ch product_list sau khi match:
[
    {
        "code": "M001",
        "name": "7 Up",
        "selected": true,
        "unit": "Lon",
        "quantity": 2,
        "unit_price": 20000,
        "similarity": 0.6078
    },
    {
        "code": "M002",
        "name": "Sting",
        "selected": false,
        "unit": "",
        "quantity": 0,
        "unit_price": 0,
        "similarity": 0.0
    },
    {
        "code": "M003",
        "name": "Tiger B·∫°c Lon L·ªõn 330ml",
        "selected": true,
        "unit": "Lon",
        "quantity": 2,
     

'[\n    {\n        "code": "M001",\n        "name": "7 Up",\n        "selected": true,\n        "unit": "Lon",\n        "quantity": 2,\n        "unit_price": 20000,\n        "similarity": 0.6078\n    },\n    {\n        "code": "M002",\n        "name": "Sting",\n        "selected": false,\n        "unit": "",\n        "quantity": 0,\n        "unit_price": 0,\n        "similarity": 0.0\n    },\n    {\n        "code": "M003",\n        "name": "Tiger B·∫°c Lon L·ªõn 330ml",\n        "selected": true,\n        "unit": "Lon",\n        "quantity": 2,\n        "unit_price": 25000,\n        "similarity": 0.9196\n    },\n    {\n        "code": "M004",\n        "name": "KhƒÉn l·∫°nh",\n        "selected": true,\n        "unit": "C√°i",\n        "quantity": 2,\n        "unit_price": 3000,\n        "similarity": 0.6415\n    },\n    {\n        "code": "M005",\n        "name": "H√†u n∆∞·ªõng ph√¥ mai",\n        "selected": true,\n        "unit": "Con",\n        "quantity": 3,\n        "unit_price": 3