In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install sentence-transformers faiss-cpu pillow numpy tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m23.8/23.8 MB[0m [31m112.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [3]:
import os
import json
import torch
import requests
import faiss
import numpy as np
from PIL import Image
from sentence_transformers import SentenceTransformer
from collections import defaultdict

In [4]:
CATEGORY_MAP = ["ÏÉÅÏùò", "Î∞îÏßÄ", "ÏïÑÏö∞ÌÑ∞", "Ïã†Î∞ú", "Í∞ÄÎ∞©"]

In [5]:
from sentence_transformers import SentenceTransformer
import torch

def load_model():
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"‚û°Ô∏è DEVICE: {DEVICE}")

    text_model = SentenceTransformer(
        "upskyy/bge-m3-korean",
        device=DEVICE
    )
    text_model.eval()
    print("ü§ñ Text model (bge-m3) loaded successfully!")

    return text_model


def _flush_batch(text_model, texts, metas, cats, buffers):
    text_vecs = text_model.encode(
        texts,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    for tv, m, c in zip(text_vecs, metas, cats):
        buffers[c].append({
            "vector": tv,
            "metadata": m,
        })

def load_all_items(jsonl_dir):
    items = []

    for fname in os.listdir(jsonl_dir):
        if not fname.endswith(".jsonl"):
            continue

        path = os.path.join(jsonl_dir, fname)
        print(f"üìÇ Loading {path}")

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                items.append(json.loads(line))

    print(f"‚úÖ Ï¥ù {len(items)}Í∞ú ÏïÑÏù¥ÌÖú Î°úÎìú ÏôÑÎ£å\n")
    return items

In [6]:
def build_style_db(
    jsonl_dir,
    output_root: str,
    batch_size: int = 32,
):
    os.makedirs(output_root, exist_ok=True)

    text_model = load_model()
    items = load_all_items(jsonl_dir)

    buffers = defaultdict(list)

    text_batch, meta_batch, cat_batch = [], [], []

    print(f"üîó Ï¥ù {len(items)}Í∞ú ÏïÑÏù¥ÌÖú Ï≤òÎ¶¨ ÏãúÏûë\n")

    for idx, item in enumerate(items):
        print(f"‚û°Ô∏è {idx+1}/{len(items)} Ï≤òÎ¶¨ Ï§ë")

        main_cat = item.get("main_cat_name")
        if main_cat not in CATEGORY_MAP:
            continue

        # ---------------------------
        # üî• ÏûÑÎ≤†Îî© ÌÖçÏä§Ìä∏: tpo, mood
        # ---------------------------
        tpo = item.get("tpo")
        mood = item.get("mood")
        text_input = tpo + ", " + mood
        print(f"text_input: {text_input}")

        # ---------------------------
        # üî• metadata (ÎÑ§Í∞Ä Ï§Ä Ïä§ÌÇ§Îßà Í∑∏ÎåÄÎ°ú)
        # ---------------------------

        metadata = {
            "product_id": item.get("product_id"),
            "brand": item.get("brand"),
            "gender": item.get("gender"),
            "price": item.get("price"),
            "price_raw": item.get("price_raw"),
            "main_cat_name": item.get("main_cat_name"),
            "sub_cat_name": item.get("sub_cat_name"),
            "style": item.get("style_name"),
            "texture": item.get("texture_name"),
            "pattern": item.get("pattern_name"),
            "fit": item.get("fit_name"),
            "mood": item.get("mood"),
            "tpo": item.get("tpo"),
            "color": item.get("ÏÉâÏÉÅ"),
            "img_url": item.get("img_url"),
            "description": item.get("description")
        }

        text_batch.append(text_input)
        meta_batch.append(metadata)
        cat_batch.append(main_cat)

        # ---------- batch encode ----------
        if len(text_batch) == batch_size:
            _flush_batch(
                text_model,
                text_batch,
                meta_batch,
                cat_batch,
                buffers,
            )
            text_batch, meta_batch, cat_batch = [], [], []

    # leftover
    if text_batch:
        _flush_batch(
            text_model,
            text_batch,
            meta_batch,
            cat_batch,
            buffers,
        )

    # ---------- FAISS Ï†ÄÏû• ----------
    for cat_key, entries in buffers.items():
        print(f"\nüíæ Saving category: {cat_key} ({len(entries)} items)")

        vectors = np.array([e["vector"] for e in entries]).astype("float32")
        metas = [e["metadata"] for e in entries]

        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)

        save_dir = os.path.join(output_root, cat_key)
        os.makedirs(save_dir, exist_ok=True)

        faiss.write_index(index, f"{save_dir}/index.faiss")

        with open(f"{save_dir}/metadata.jsonl", "w", encoding="utf-8") as f:
            for m in metas:
                f.write(json.dumps(m, ensure_ascii=False) + "\n")

        print(f"‚úÖ {cat_key} ÏôÑÎ£å")

    print("\nüéâ TPO text-only DB Íµ¨Ï∂ï ÏôÑÎ£å!")


In [7]:
def main():
    jsonl_dir = "/content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color"
    output_root = "/content/drive/MyDrive/ÌîÑÎ©î 10ÌåÄ/ÎØºÏÑú/faiss/tpo"
    build_style_db(jsonl_dir=jsonl_dir, output_root=output_root,batch_size=32)

if __name__ == "__main__":
    main()

‚û°Ô∏è DEVICE: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

ü§ñ Text model (bge-m3) loaded successfully!
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·Ñã·Öß_·ÑÄ·Ö°·Ñá·Ö°·Üº_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·Ñã·Öß_·Ñâ·Öµ·Ü´·Ñá·Ö°·ÜØ_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·Ñã·Öß_·Ñã·Ö°·Ñã·ÖÆ·Ñê·Ö•_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·Ñã·Öß_·Ñá·Ö°·Ñå·Öµ_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·Ñã·Öß_·Ñâ·Ö°·Üº·Ñã·Ö¥_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·ÑÇ·Ö°·Ü∑_·Ñâ·Ö°·Üº·Ñã·Ö¥_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·ÑÇ·Ö°·Ü∑_·ÑÄ·Ö°·Ñá·Ö°·Üº_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ·Ü∑/final_db_color/·ÑÇ·Ö°·Ü∑_·Ñá·Ö°·Ñå·Öµ_color.jsonl
üìÇ Loading /content/drive/MyDrive/·Ñë·Ö≥·ÑÜ·Ö¶ 10·Ñê·Öµ

In [8]:
# ÌôïÏù∏

import faiss
import json
import os
import numpy as np

def verify_category_db(category_path):
    print(f"üîç Í≤ÄÏ¶ù ÏãúÏûë: {category_path}")

    index_path = os.path.join(category_path, "index.faiss")
    meta_path = os.path.join(category_path, "metadata.jsonl")

    if not os.path.exists(index_path) or not os.path.exists(meta_path):
        print("‚ùå ÏóêÎü¨: .faiss ÎòêÎäî .jsonl ÌååÏùºÏù¥ Ï°¥Ïû¨ÌïòÏßÄ ÏïäÏäµÎãàÎã§.")
        return

    # 1. FAISS Ïù∏Îç±Ïä§ Î°úÎìú
    index = faiss.read_index(index_path)
    total_vectors = index.ntotal

    # 2. Î©îÌÉÄÎç∞Ïù¥ÌÑ∞ Î°úÎìú
    metas = []
    with open(meta_path, "r", encoding="utf-8") as f:
        for line in f:
            metas.append(json.loads(line))
    total_metas = len(metas)

    # 3. Í∞úÏàò ÏùºÏπò ÌôïÏù∏
    print(f"üìä Ï¥ù Î≤°ÌÑ∞ Ïàò: {total_vectors}")
    print(f"üìä Ï¥ù Î©îÌÉÄÎç∞Ïù¥ÌÑ∞ Ïàò: {total_metas}")

    if total_vectors == total_metas:
        print("‚úÖ [ÏÑ±Í≥µ] Î≤°ÌÑ∞ÏôÄ Î©îÌÉÄÎç∞Ïù¥ÌÑ∞Ïùò Í∞úÏàòÍ∞Ä ÏùºÏπòÌï©ÎãàÎã§.")
    else:
        print(f"‚ùå [Ïã§Ìå®] Í∞úÏàò Î∂àÏùºÏπò! (Ï∞®Ïù¥: {abs(total_vectors - total_metas)})")

    # 4. Îç∞Ïù¥ÌÑ∞ ÏÉòÌîåÎßÅ ÌôïÏù∏ (product_id ÌôïÏù∏)
    if total_metas > 0:
        print("\n--- ÏÉÅÏúÑ 3Í∞ú Îç∞Ïù¥ÌÑ∞ ÏÉòÌîå ---")
        for i in range(min(3, total_metas)):
            p_id = metas[i].get("product_id", "N/A")
            print(f"Index [{i}]: ID={p_id}")

    return total_vectors == total_metas

# Ïã§Ìñâ ÏòàÏãú (ÏïÑÏö∞ÌÑ∞ Ïπ¥ÌÖåÍ≥†Î¶¨ ÌôïÏù∏ Ïãú)
verify_category_db("/content/drive/MyDrive/ÌîÑÎ©î 10ÌåÄ/ÎØºÏÑú/faiss/tpo/Ïã†Î∞ú")

üîç Í≤ÄÏ¶ù ÏãúÏûë: /content/drive/MyDrive/ÌîÑÎ©î 10ÌåÄ/ÎØºÏÑú/faiss/tpo/Ïã†Î∞ú
üìä Ï¥ù Î≤°ÌÑ∞ Ïàò: 155
üìä Ï¥ù Î©îÌÉÄÎç∞Ïù¥ÌÑ∞ Ïàò: 155
‚úÖ [ÏÑ±Í≥µ] Î≤°ÌÑ∞ÏôÄ Î©îÌÉÄÎç∞Ïù¥ÌÑ∞Ïùò Í∞úÏàòÍ∞Ä ÏùºÏπòÌï©ÎãàÎã§.

--- ÏÉÅÏúÑ 3Í∞ú Îç∞Ïù¥ÌÑ∞ ÏÉòÌîå ---
Index [0]: ID=5627850
Index [1]: ID=5627844
Index [2]: ID=4633774


True