In [3]:
import os
import json

import pandas as pd
import numpy as np

import torch
from transformers import AutoTokenizer, AutoModel
import faiss

In [4]:
# 데이터 불러오기
import os, json
import pandas as pd

def load_all_clauses_from_dir(root_dir):
    """Load every clause JSON file under ``root_dir`` into one DataFrame.

    ``root_dir`` is expected to contain the two label sub-folders ``유리`` and
    ``불리``. Every ``*.json`` file found directly inside them becomes one row;
    other files are ignored.

    Args:
        root_dir: Directory that holds the ``유리`` and ``불리`` sub-folders.

    Returns:
        pd.DataFrame: One row per JSON file, with the columns

            * ``filename`` - name of the JSON file,
            * ``label``    - name of the sub-folder the file was read from,
            * ``text``     - entries of ``clauseArticle`` joined by single spaces,
            * ``basis``    - entries of ``illdcssBasiss`` joined by single spaces
              for ``불리`` files, ``None`` for ``유리`` files.

        The four columns are present even when no JSON file was found.

    Raises:
        FileNotFoundError: If one of the label sub-folders does not exist.
    """
    columns = ["filename", "label", "text", "basis"]
    records = []
    for label in ("유리", "불리"):
        folder = os.path.join(root_dir, label)
        # os.listdir() yields entries in arbitrary order; sorting keeps the row
        # order (and therefore any positional id derived from it) reproducible.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".json"):
                continue
            with open(os.path.join(folder, fname), "r", encoding="utf-8") as f:
                data = json.load(f)
            # `or []` also covers keys that are present but hold null.
            text = " ".join(data.get("clauseArticle") or [])
            if label == "불리":
                basis = " ".join(data.get("illdcssBasiss") or [])
            else:
                basis = None
            records.append({
                "filename": fname,
                "label": label,
                "text": text,
                "basis": basis
            })
    return pd.DataFrame(records, columns=columns)

# Build the clause DataFrame from the label sub-folders under ../data/raw
# (the path is relative to the kernel's working directory).
df = load_all_clauses_from_dir("../data/raw")

In [17]:
# Save the DataFrame to CSV without the index column; the utf-8-sig codec
# writes a BOM at the start of the file.
# NOTE(review): 'labled' looks like a typo for 'labeled' — confirm before
# renaming, since other code may read this exact path.
df.to_csv('labled.csv', index=False, encoding='utf-8-sig')

In [18]:
# Load the language model
import torch
from transformers import BertModel, AutoTokenizer

MODEL_NAME = "skt/kobert-base-v1"

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Slow (non-fast) tokenizer for the checkpoint.
# NOTE(review): AutoTokenizer may not resolve to a BERT-style tokenizer for
# this checkpoint — confirm the special-token layout and token_type_ids it
# produces against the model card.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# Inference only; eval() returns the module itself, so it can be chained.
model = BertModel.from_pretrained(MODEL_NAME).eval()

# Last expression of the cell: moves the weights and echoes the module repr.
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(8002, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)

In [22]:
import torch
import numpy as np

def embed_texts(
    texts,
    model,
    tokenizer,
    batch_size: int = 16,
    max_length: int = None,
    device: torch.device = None
) -> np.ndarray:
    """
    Embed texts by taking the hidden state at the first token position.

    The first position of ``last_hidden_state`` is returned for every text;
    this is the [CLS] embedding when the tokenizer prepends a [CLS] token.

    Args:
        texts (Sequence[str] | str): Input texts to embed. Any iterable of
            strings is accepted (list, tuple, pandas Series, ...); a single
            string is treated as one text.
        model (torch.nn.Module): Pretrained Transformer model. It is moved to
            ``device`` and switched to eval mode as a side effect.
        tokenizer (transformers.PreTrainedTokenizer): Corresponding tokenizer.
        batch_size (int): Number of samples per batch. Must be >= 1.
        max_length (int, optional): Maximum token length; longer inputs are
            truncated. Defaults to model.config.max_position_embeddings.
        device (torch.device, optional): Device for inference. Defaults to CUDA
            if available else CPU.

    Returns:
        np.ndarray: Array of shape (len(texts), hidden_size) with embeddings.
            For empty input, a float32 array of shape
            (0, model.config.hidden_size) is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise the input as a real list: slicing a Series/tuple would hand
    # the tokenizer a non-list batch, and slicing a bare string would chop it
    # into batch_size-character pieces.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Determine max_length if not provided
    if max_length is None:
        max_length = model.config.max_position_embeddings

    # Determine device
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare model
    model = model.to(device)
    model.eval()

    all_embeds = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start : start + batch_size]
            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model(**encoded)
            # Hidden state of the first token position for every sample
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeds.append(cls_embeddings.cpu().numpy())

    # np.vstack([]) raises, so empty input gets an explicit empty result.
    if not all_embeds:
        hidden_size = getattr(model.config, "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(all_embeds)


# ===================== Usage Example =====================
if __name__ == "__main__":
    from transformers import AutoTokenizer, AutoModel

    # `__name__` is "__main__" inside the notebook kernel, so this demo runs
    # whenever the cell runs. Every name is prefixed with `demo_` so the demo
    # does not overwrite the notebook-level `model`, `tokenizer`, `texts` or
    # `embeddings` that other cells rely on.

    # Load tokenizer and model
    demo_model_name = "bert-base-uncased"
    demo_tokenizer = AutoTokenizer.from_pretrained(demo_model_name)
    demo_model = AutoModel.from_pretrained(demo_model_name)

    # Sample texts
    demo_texts = [
        "Hello world!",
        "This is a test sentence to embed.",
        "여러 문장을 한 번에 임베딩해 봅니다."
    ]

    # Generate embeddings
    demo_embeddings = embed_texts(
        texts=demo_texts,
        model=demo_model,
        tokenizer=demo_tokenizer,
        batch_size=2
    )

    # Display shapes and a sample
    print(f"Embeddings shape: {demo_embeddings.shape}")  # (3, hidden_size)
    print(f"First embedding vector (truncated): {demo_embeddings[0][:5]}...")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embeddings shape: (3, 768)
First embedding vector (truncated): [-0.14241247  0.13353735 -0.12907091 -0.17164774 -0.48322865]...


In [20]:
# FAISS 인덱스
def build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
    """
    Build a flat inner-product FAISS index over L2-normalised vectors.

    With unit-length vectors the inner product equals cosine similarity, so
    queries must be L2-normalised as well.

    Args:
        embeddings: (N, D) array-like of vectors. A float32, C-contiguous
            numpy array is normalised IN PLACE (the caller's array is
            modified). Any other dtype or memory layout is first converted to
            a float32 copy, and the caller's data is left untouched.

    Returns:
        faiss.Index: ``IndexFlatIP`` of dimension D holding the N normalised
        vectors.

    Raises:
        ValueError: If ``embeddings`` is not 2-dimensional.
    """
    # No copy is made when the input is already float32 and C-contiguous, so
    # the in-place normalisation that callers may rely on is preserved.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D (N, D) array, got shape {vectors.shape}"
        )

    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index


In [None]:
# Maximum number of positions the loaded model can embed.
# Diagnostic: find out why embedding the dataset blows up.
max_len = model.config.max_position_embeddings

# Untruncated token count of every clause text (missing text counts as "").
token_counts = [
    tokenizer(clause, return_tensors="pt", truncation=False, padding=False)["input_ids"].size(1)
    for clause in df["text"].fillna("").astype(str)
]

# Longest sequence and the position where it first occurs.
max_seq, max_idx = 0, None
for position, count in enumerate(token_counts):
    if count > max_seq:
        max_seq, max_idx = count, position

print(f"dataset max seq_len = {max_seq} (at idx {max_idx})")
print(f"model.max_position_embeddings = {max_len}")


dataset max seq_len = 993 (at idx 6339)
model.max_position_embeddings = 512


In [None]:
# Build the index
index = build_faiss_index(embeddings)
print("FAISS index total vectors:", index.ntotal)

# Top-5 search using the first item as the query.
# The query is copied and normalised explicitly so the similarity stays a
# cosine score regardless of whether `embeddings` was normalised in place.
query = np.array(embeddings[:1], dtype=np.float32, order="C")
faiss.normalize_L2(query)
D, I = index.search(query, k=5)
for rank, idx in enumerate(I[0]):
    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    if idx < 0:
        continue
    # FAISS ids are insertion positions, so look rows up by position.
    row = df.iloc[idx]
    sim   = D[0][rank]
    label = row["label"]
    text  = row["text"][:50] + "…"
    raw_basis = row["basis"]
    # basis may be None or NaN for rows without one; only slice real strings.
    basis = raw_basis[:50] + "…" if isinstance(raw_basis, str) and raw_basis else ""
    print(f"Rank {rank+1}: (label={label}, sim={sim:.4f})")
    print(f"  text : {text}")
    if basis:
        print(f"  basis: {basis}")
    print()


In [None]:
from sentence_transformers import SentenceTransformer

# Workaround: embed with a different model instead of the function that was crashing.
# Lightweight model, pinned to the CPU.
sbert = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# Clause texts as a plain list of strings (missing values become "").
texts = df["text"].fillna("").astype(str).tolist()

# Encode in batches of 32 with a progress bar, returning a NumPy array.
embeddings = sbert.encode(texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# Store each vector on the DataFrame as a list of float32 values.
df["embedding"] = embeddings.astype(np.float32).tolist()

print("임베딩 shape:", embeddings.shape)


NameError: name 'df' is not defined