In [6]:
import pandas as pd

# Cleaned Korean precedent dataset; the path is relative to the notebook's working directory.
PRECEDENTS_PATH = "korean_precedents_clean.parquet"
df = pd.read_parquet(PRECEDENTS_PATH)


In [7]:
# Display the column labels of the loaded frame.
df.columns


Index(['판례정보일련번호', '사건번호', '사건명', '법원명', '사건종류명', '판결유형', '선고일자_norm', '참조조문',
       'case_text', 'text_length'],
      dtype='object')

In [8]:
def safe_text(x):
    """Return ``x`` as a whitespace-trimmed string.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None``
    or ``NaN``) become the empty string; every other value is passed through
    ``str()`` and stripped of leading/trailing whitespace.
    """
    return "" if pd.isna(x) else str(x).strip()


In [11]:
# Normalise the raw case text element-wise: missing values become "" and
# surrounding whitespace is stripped (see safe_text).
df["fact_text"] = df["case_text"].map(safe_text)


In [12]:
# Columns carried forward: the original metadata columns plus the cleaned
# "fact_text"; "case_text" and "text_length" are dropped here.
keep_columns = [
    "판례정보일련번호",
    "사건번호",
    "사건명",
    "법원명",
    "사건종류명",
    "판결유형",
    "선고일자_norm",
    "참조조문",
    "fact_text",
]
df = df[keep_columns]


In [13]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Sentence-embedding model identifier passed to SentenceTransformer.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)


In [14]:
def embed_texts(texts, batch_size=32):
    """Encode ``texts`` with the notebook-level ``model`` in fixed-size batches.

    Args:
        texts: Sliceable sequence of strings (the notebook passes a list).
        batch_size: Number of texts handed to ``model.encode`` per call.
            Must be at least 1.

    Returns:
        A 2-D numpy array with one row per input text, in input order.
        For empty input a ``(0, dim)`` array is returned, where ``dim`` is
        the value reported by ``model.get_sentence_embedding_dimension()``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # Previously a zero/negative batch size surfaced as an opaque
        # ValueError from range() or np.vstack(); fail early and clearly.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(texts) == 0:
        # np.vstack([]) raises, so return an explicitly empty matrix instead.
        # `or 0` guards against models that report no fixed dimension.
        dim = model.get_sentence_embedding_dimension() or 0
        # float32 is assumed to match what model.encode returns — TODO confirm.
        return np.empty((0, dim), dtype=np.float32)

    embeddings = [
        model.encode(texts[start:start + batch_size], show_progress_bar=False)
        for start in tqdm(range(0, len(texts), batch_size))
    ]
    return np.vstack(embeddings)


In [15]:
# Encoding the full corpus is slow (the recorded run took ~52 minutes), so
# reuse the embeddings written to "case_embeddings.npy" by a previous run
# when that file exists and has exactly one row per case.
# NOTE: only the row count is checked — delete the .npy file after changing
# the model or the text column, otherwise stale vectors will be reused.
try:
    fact_embeddings = np.load("case_embeddings.npy")
except FileNotFoundError:
    fact_embeddings = None

if fact_embeddings is None or len(fact_embeddings) != len(df):
    fact_embeddings = embed_texts(df["fact_text"].tolist())


100%|██████████| 2543/2543 [52:10<00:00,  1.23s/it]


In [16]:
# Persist the embedding matrix to disk in NumPy's .npy format.
EMBEDDINGS_PATH = "case_embeddings.npy"
np.save(EMBEDDINGS_PATH, fact_embeddings)


In [17]:
import faiss

# Build an exact inner-product index over the case embeddings.
# The embeddings were produced by model.encode() without normalisation, so a
# raw inner product would favour vectors with a large norm. L2-normalising
# the indexed vectors makes the inner-product ranking equal to the
# cosine-similarity ranking. Normalise query vectors the same way if the
# returned scores themselves should be cosine values.
# np.array(...) makes a float32, C-contiguous *copy*, so the in-place
# normalisation below does not modify `fact_embeddings`.
index_vectors = np.array(fact_embeddings, dtype=np.float32, order="C")
faiss.normalize_L2(index_vectors)

dim = index_vectors.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(index_vectors)

# Sanity check: every embedding row must have been added to the index.
assert index.ntotal == len(index_vectors)

faiss.write_index(index, "case_index.faiss")
