In [None]:
import os
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# OpenAI互換クライアント
from openai import OpenAI

# ========= Settings =========
BASE_URL = "http://localhost:1234/v1"  # LM Studio Local Server
API_KEY  = "lm-studio"                # any value works
MODEL_ID = "text-embedding-mxbai-embed-large-v1"  # API identifier (as shown in the screenshot)

INPUT_CSV = "コマンド推論用_1000_chunked.csv"
TEXT_COL  = "chunk_text"

INDEX_ROOT = Path("index")            # per-model folders are created under this directory
BATCH_SIZE = 64                       # try 16/32 if it is too heavy or crashes
SLEEP_SEC  = 0.0                      # use 0.02-0.1 if the server gets congested
NORMALIZE_L2 = True                   # True is recommended when doing cosine search
# =======================

def slugify(s: str) -> str:
    """Return ``s`` reduced to a string usable as a folder name.

    Surrounding whitespace is stripped, every run of characters other than
    word characters, ``-`` and ``.`` becomes a single ``_``, and consecutive
    underscores are then collapsed into one.
    """
    replaced = re.sub(r"[^\w\-\.]+", "_", s.strip())
    return re.sub(r"_+", "_", replaced)

def l2_normalize(mat: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale vectors to unit L2 norm.

    Args:
        mat: Either a single 1-D vector or an array with two or more
            dimensions. For 2-D input each row is normalized; in general
            the norm is taken along axis 1, and along the only axis for
            1-D input.
        eps: Lower bound applied to each norm before dividing, so all-zero
            vectors are returned as zeros instead of producing NaN/inf.

    Returns:
        An array of the same shape as ``mat`` with each vector divided by
        ``max(norm, eps)``.
    """
    arr = np.asanyarray(mat)
    # Keep axis=1 for matrices (and higher ranks); a lone 1-D query vector
    # has no axis 1, so normalize over its single axis instead.
    axis = 1 if arr.ndim >= 2 else -1
    norms = np.linalg.norm(arr, axis=axis, keepdims=True)
    return arr / np.maximum(norms, eps)

def embed_with_lmstudio(texts, client: OpenAI, model: str, batch_size: int = 64):
    """Embed ``texts`` in batches through an OpenAI-compatible embeddings endpoint.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        client: Client whose ``embeddings.create(model=..., input=...)`` is
            called once per batch.
        model: Model identifier forwarded to the endpoint.
        batch_size: Number of texts sent per request; must be positive.

    Returns:
        ``(embs, times)``. ``embs`` is a float32 array with one row per input
        text, in input order; ``times`` lists the elapsed seconds of each
        request. For empty ``texts`` an empty ``(0, 0)`` array and an empty
        list are returned.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if a response does
            not contain exactly one embedding per input text.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    batch_arrays = []
    times = []

    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]

        # perf_counter is monotonic, so the measurement is not disturbed by
        # system clock adjustments during a long run.
        t0 = time.perf_counter()
        resp = client.embeddings.create(model=model, input=batch)
        times.append(time.perf_counter() - t0)

        if len(resp.data) != len(batch):
            raise ValueError(
                f"expected {len(batch)} embeddings for batch starting at "
                f"{start}, got {len(resp.data)}"
            )

        # Order by the per-item index when the server provides one rather
        # than relying on response order; the sort is stable, so items
        # without an index keep the order they were returned in.
        items = sorted(resp.data, key=lambda d: getattr(d, "index", 0))
        batch_arrays.append(
            np.array([d.embedding for d in items], dtype=np.float32)
        )

        if SLEEP_SEC:
            time.sleep(SLEEP_SEC)

    if not batch_arrays:
        # Nothing to embed: np.stack/concatenate would raise on an empty list.
        return np.empty((0, 0), dtype=np.float32), times

    embs = np.concatenate(batch_arrays, axis=0)  # (N, dim)
    return embs, times

# ========= Run =========
df_chunks = pd.read_csv(INPUT_CSV)
if TEXT_COL not in df_chunks.columns:
    # Explicit raise rather than assert: asserts are removed under `python -O`,
    # which would let a wrong column name fail later with a less clear error.
    raise ValueError(f"{TEXT_COL} 列が見つかりません: {list(df_chunks.columns)}")

# Missing cells become empty strings so every row still gets an embedding and
# the rows of the saved embeddings line up with the rows of df_chunks.
texts = df_chunks[TEXT_COL].fillna("").astype(str).tolist()
if not texts:
    # Without any chunk there is nothing to embed, and the timing statistics
    # and dimension lookup below would fail on empty data.
    raise ValueError(f"入力CSVにチャンクがありません: {INPUT_CSV}")

client = OpenAI(base_url=BASE_URL, api_key=API_KEY)

print(f"モデル: {MODEL_ID}")
print(f"総チャンク数: {len(texts)}（batch={BATCH_SIZE}）")

embs, times = embed_with_lmstudio(texts, client, MODEL_ID, batch_size=BATCH_SIZE)

# Per-batch request timing summary.
print("\n--- 集計 ---")
print(f"embeddings shape: {embs.shape}")
print(f"平均: {np.mean(times):.3f} 秒 / バッチ")
print(f"最小: {np.min(times):.3f} 秒, 最大: {np.max(times):.3f} 秒")
print(f"総処理時間: {np.sum(times):.2f} 秒")

if NORMALIZE_L2:
    embs = l2_normalize(embs).astype(np.float32)
    print("L2正規化: ON（cosine用）")

# ========= Save (one folder per model) =========
model_dir = INDEX_ROOT / slugify(MODEL_ID)
model_dir.mkdir(parents=True, exist_ok=True)

emb_path = model_dir / "embeddings.npy"
chunks_path = model_dir / "chunks.parquet"
meta_path = model_dir / "meta.txt"

np.save(emb_path, embs)
df_chunks.to_parquet(chunks_path, index=False)

# Record the settings used for this run next to the saved embeddings.
with open(meta_path, "w", encoding="utf-8") as f:
    f.write(f"model_id={MODEL_ID}\n")
    f.write(f"input_csv={INPUT_CSV}\n")
    f.write(f"text_col={TEXT_COL}\n")
    f.write(f"batch_size={BATCH_SIZE}\n")
    f.write(f"normalize_l2={NORMALIZE_L2}\n")
    f.write(f"num_chunks={len(texts)}\n")
    f.write(f"dim={embs.shape[1]}\n")

print(f"\n保存しました: {emb_path}")
print(f"保存しました: {chunks_path}")
print(f"保存しました: {meta_path}")


モデル: text-embedding-mxbai-embed-large-v1
総チャンク数: 12239（batch=64）


100%|██████████| 192/192 [42:16<00:00, 13.21s/it]


--- 集計 ---
embeddings shape: (12239, 1024)
平均: 13.208 秒 / バッチ
最小: 2.133 秒, 最大: 17.043 秒
総処理時間: 2536.02 秒
L2正規化: ON（cosine用）

保存しました: index/text-embedding-mxbai-embed-large-v1/embeddings.npy
保存しました: index/text-embedding-mxbai-embed-large-v1/chunks.parquet
保存しました: index/text-embedding-mxbai-embed-large-v1/meta.txt



