In [1]:
!pip install faiss-gpu==1.7.2

Collecting faiss-gpu==1.7.2
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
!pip install flash-attn==2.5.8 --no-build-isolation

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting flash-attn==2.5.8
  Downloading flash_attn-2.5.8.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting einops (from flash-attn==2.5.8)
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting ninja (from flash-attn==2.5.8)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading einops-0.8.1-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ..

In [1]:
# -*- coding: utf-8 -*-
# KULLM3 + FAISS(HNSW) 빠른 RAG (고정폭 lookbehind 수정 포함)

import os, re, json, math
from typing import List, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

PDF_PATH = "../data/개인정보 보호법.pdf"  # source PDF passed to load_pdf_text
LLM_ID = "nlpai-lab/KULLM3"  # model id used for both the tokenizer and the causal LM
EMB_MODEL = "jhgan/ko-sroberta-multitask"   # alternative: intfloat/multilingual-e5-small
TOP_K = 4  # default number of neighbours requested from the FAISS index
CHUNK_TOKENS = 600  # default max_tokens of chunk_by_tokens
CHUNK_OVERLAP = 32  # default overlap argument of chunk_by_tokens
CTX_TOKEN_BUDGET = 900  # default token_budget of pack_context
SEED = 42  # seed handed to torch.manual_seed below

# Seed PyTorch's random number generator.
torch.manual_seed(SEED)

# 0) 경량 라우터 (정규식/키워드)
# Keywords that route a query to the retrieval (RAG) path.
# NOTE(review): the single-syllable keyword "제" matches many unrelated words
# (e.g. "문제", "경제"), so most Korean queries take the retrieval path — confirm
# this is intended.
LAW_KWS = ("개인정보", "제", "조(", "시행령", "과징금", "처벌", "보안", "금융", "증권", "PPI", "CPI", "자본시장법")
def route_is_law(query: str) -> bool:
    """Return True when `query` contains any routing keyword from LAW_KWS.

    Matching is case-insensitive: both the query and each keyword are
    lower-cased before the substring test. Previously only the query was
    lower-cased, so the upper-case keywords "PPI" and "CPI" could never match.
    """
    q = query.lower()
    return any(kw.lower() in q for kw in LAW_KWS)

# ---------------- PDF 로드 & 정제 ----------------
from PyPDF2 import PdfReader

def clean_article_text(text: str) -> str:
    """Strip boilerplate and markup from an article body and normalise it.

    Steps, applied in this order:
      1. delete characters in the range U+4E00-U+9FFF (CJK ideographs);
      2. delete the "법제처 <n> 국가법령정보센터 ..." boilerplate strings
         (presumably page headers/footers of the source PDF);
      3. delete ``<...>`` and ``[...]`` spans;
      4. rewrite circled numbers ①..⑳ as ``(1)``..``(20)``;
      5. collapse whitespace runs to one space and trim the ends;
      6. delete parentheses that contain nothing but whitespace.
    """
    # Longer boilerplate patterns come before their shorter sub-patterns so the
    # whole phrase is removed in one go.
    removal_patterns = (
        r'[\u4e00-\u9fff]',
        r'법제처\s+\d+\s+국가법령정보센터\s*개인정보\s*보호법',
        r'법제처\s+\d+\s+국가법령정보센터',
        r'국가법령정보센터\s*개인정보\s*보호법',
        r'법제처|국가법령정보센터',
        r'<[^>]+>',
        r'\[[^\]]+\]',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Map each circled digit to its parenthesised number in a single pass.
    circled_map = {
        ord(mark): f'({number})'
        for number, mark in enumerate('①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳', start=1)
    }
    text = text.translate(circled_map)

    collapsed = re.sub(r'\s+', ' ', text).strip()
    # Empty parentheses are removed after whitespace collapsing, so the spaces
    # that surrounded them are left in place.
    return re.sub(r'\(\s*\)', '', collapsed)

def load_pdf_text(pdf_path: str) -> str:
    """Return the text of every page of the PDF at `pdf_path`.

    Each page's extracted text is followed by a newline; a page whose
    extraction yields a falsy value contributes only the newline.
    """
    pages = PdfReader(pdf_path).pages
    return "".join((page.extract_text() or "") + "\n" for page in pages)

def split_articles(text: str) -> List[Tuple[str, str, str]]:
    """Split the law text into ``(article_id, title, cleaned_body)`` tuples.

    An article starts at a header of the form ``제N조(title)`` or
    ``제N조의M(title)``. Text before the first header is discarded. Newlines
    inside a body are replaced by spaces before it is passed to
    clean_article_text.
    """
    # With one capture group, re.split returns
    # [preamble, header, body, header, body, ...].
    pieces = re.split(r'(제\d+조(?:의\d+)?\([^)]+\))', text)
    header_re = re.compile(r'(제\d+조(?:의\d+)?)[(]([^)]+)[)]')

    articles = []
    for header, raw_body in zip(pieces[1::2], pieces[2::2]):
        parsed = header_re.match(header)
        if parsed is None:
            continue
        art_id, title = parsed.groups()
        body = raw_body.strip().replace("\n", " ")
        articles.append((art_id, title, clean_article_text(body)))
    return articles


In [2]:
# ---------------- 토큰 기준 청킹 ----------------
# Tokenizer of the LLM; it is also used to measure chunk and context sizes in tokens.
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_ID)
# Fall back to the EOS token when the tokenizer defines no pad token.
if llm_tokenizer.pad_token is None:
    llm_tokenizer.pad_token = llm_tokenizer.eos_token
llm_tokenizer.padding_side = "right"

def token_len(s: str) -> int:
    """Return the number of LLM tokens in `s`, with special tokens excluded."""
    encoded = llm_tokenizer(s, add_special_tokens=False)
    token_ids = encoded["input_ids"]
    return len(token_ids)

def split_sentences_ko(text: str) -> List[str]:
    """Split text into sentences using fixed-width lookbehinds.

    Whitespace runs are first collapsed to a single space and the ends are
    trimmed. The text is then split on whitespace that follows either:
    - the ending '다.', or
    - a terminator from the set . ? ! 。 ！ ？

    Returns an empty list when nothing remains after normalisation.
    """
    normalised = re.sub(r'\s+', ' ', text).strip()
    if normalised == "":
        return []
    boundary = re.compile(r'(?<=다\.)\s+|(?<=[.?!。！？])\s+')
    return boundary.split(normalised)

def chunk_by_tokens(text: str, header: str, max_tokens=CHUNK_TOKENS, overlap=CHUNK_OVERLAP) -> List[str]:
    """Pack the sentences of `text` into chunks of roughly `max_tokens` tokens.

    Every chunk starts with ``header.strip() + "\\n"`` and the header's tokens
    count towards the limit. Sentences are never split, so a single sentence
    longer than the limit still produces an oversized chunk.

    Args:
        text: Body text to chunk; split with split_sentences_ko.
        header: Prefix placed at the start of every chunk.
        max_tokens: Target upper bound on tokens per chunk.
        overlap: Acts as an on/off switch. When > 0, the last sentence of a
            finished chunk is repeated at the start of the next one. The
            numeric value is not used as a token count.

    Returns:
        The list of chunk strings. When `text` yields no sentences, a single
        chunk built from `text` itself is returned.
    """
    prefix = header.strip() + "\n"
    # The prefix is loop-invariant, so tokenize it once instead of at every
    # chunk boundary.
    prefix_toks = token_len(prefix)
    sents = split_sentences_ko(text) or [text]

    chunks, cur, cur_toks = [], [], prefix_toks
    for s in sents:
        tl = token_len(s)
        if cur and cur_toks + tl > max_tokens:
            chunks.append(prefix + " ".join(cur))
            carry = cur[-1] if overlap > 0 else ""
            carry_toks = token_len(carry) if carry else 0
            # Repeat the previous sentence only when it fits next to the
            # incoming one. Carrying it unconditionally could push the new
            # chunk past max_tokens.
            if carry and prefix_toks + carry_toks + tl <= max_tokens:
                cur, cur_toks = [carry], prefix_toks + carry_toks
            else:
                cur, cur_toks = [], prefix_toks
        cur.append(s)
        cur_toks += tl
    if cur:
        chunks.append(prefix + " ".join(cur))
    return chunks




In [3]:
# ---------------- 임베딩 & FAISS(HNSW) ----------------
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np, faiss
from dataclasses import dataclass

# Sentence-embedding model used for both the corpus chunks and the queries.
# It runs on CUDA when available, otherwise on CPU. Embeddings are requested
# L2-normalised; the cosine conversion in faiss_search_with_scores relies on that.
embeddings = HuggingFaceEmbeddings(
    model_name=EMB_MODEL,
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True, "batch_size": 128,
                   "convert_to_numpy": True, "convert_to_tensor": False}
)

@dataclass
class Doc:
    """One retrievable chunk of text together with its metadata."""
    text: str  # chunk text; the data-preparation cell stores the header-prefixed chunk here
    meta: dict  # metadata; the data-preparation cell stores "article", "title" and "tok_len"

def build_faiss_hnsw(vectors: np.ndarray, m: int = 32, ef_search: int = 64) -> faiss.IndexHNSWFlat:
    """Build a FAISS ``IndexHNSWFlat`` over `vectors` and return it.

    Args:
        vectors: 2-D array; its second dimension is used as the index
            dimensionality. Cast to float32 before being added.
        m: Second constructor argument of ``IndexHNSWFlat`` (presumably the
            HNSW links-per-node parameter — confirm against the FAISS docs).
        ef_search: Value assigned to ``index.hnsw.efSearch``.
    """
    n_dims = vectors.shape[1]
    hnsw_index = faiss.IndexHNSWFlat(n_dims, m)
    hnsw_index.hnsw.efSearch = ef_search
    float_vectors = vectors.astype(np.float32)
    hnsw_index.add(float_vectors)
    return hnsw_index


  embeddings = HuggingFaceEmbeddings(
  return self.fget.__get__(instance, owner)()


In [4]:
# ---------------- 데이터 준비 ----------------
# Read the PDF and split it into (article id, title, cleaned body) tuples.
full_text = load_pdf_text(PDF_PATH)
articles = split_articles(full_text)

# Chunk every article body; each chunk is prefixed with a header naming the law and the article.
docs: List[Doc] = []
for art_id, title, body in articles:
    header = f"개인정보 보호법 {art_id}({title})"
    for ch in chunk_by_tokens(body, header, max_tokens=CHUNK_TOKENS, overlap=CHUNK_OVERLAP):
        docs.append(Doc(text=ch, meta={
            "article": art_id, "title": title, "tok_len": token_len(ch)  # cache the token count for pack_context
        }))

# Embed all chunks; row i of emb_matrix corresponds to docs[i].
corpus_texts = [d.text for d in docs]
emb_matrix = np.array(embeddings.embed_documents(corpus_texts), dtype=np.float32)  # (N, D)

# index = build_faiss_hnsw(emb_matrix, m=32, ef_search=64)
index = build_faiss_hnsw(emb_matrix, m=32, ef_search=32)  # ef_search lowered 64 -> 32 (author's note: usually close to twice as fast)


def faiss_search(query: str, top_k: int = TOP_K) -> List[Doc]:
    """Return the documents nearest to `query` in the FAISS index.

    The query is embedded, searched against the module-level `index`, and the
    returned ids are mapped back to `docs`. Negative ids are skipped.
    """
    query_vec = np.array(embeddings.embed_query(query), dtype=np.float32).reshape(1, -1)
    _, neighbor_ids = index.search(query_vec, top_k)
    hits = []
    for raw_id in neighbor_ids[0]:
        doc_id = int(raw_id)
        if doc_id >= 0:
            hits.append(docs[doc_id])
    return hits

# pack_context에서 캐시 활용 + 잘라붙일 때만 토크나이즈
def pack_context(docs_in, token_budget=CTX_TOKEN_BUDGET):
    """Join document texts, in order, without exceeding `token_budget` tokens.

    Token counts come from ``meta["tok_len"]``; a missing count is computed
    with token_len and written back to the document's meta. The first document
    that does not fit ends the packing. If more than 50 tokens of budget
    remain at that point, the document is truncated to the remaining budget
    and appended; otherwise it is dropped. Pieces are joined with a blank line.
    """
    pieces = []
    spent = 0
    for doc in docs_in:
        n_tokens = doc.meta.get("tok_len", None)
        if n_tokens is None:
            # Not cached yet: compute once and remember it on the document.
            n_tokens = token_len(doc.text)
            doc.meta["tok_len"] = n_tokens

        if spent + n_tokens > token_budget:
            leftover = token_budget - spent
            if leftover > 50:
                # Tokenize only when a document actually has to be cut.
                token_ids = llm_tokenizer(doc.text, add_special_tokens=False)["input_ids"]
                pieces.append(llm_tokenizer.decode(token_ids[:leftover]))
            break

        pieces.append(doc.text)
        spent += n_tokens
    return "\n\n".join(pieces)


In [5]:
# ---------------- LLM 로드 & 생성 ----------------
from transformers import BitsAndBytesConfig


def _load_llm(attn_implementation=None):
    """Load the 4-bit quantised LLM, optionally forcing an attention backend.

    The attention implementation has to be chosen at load time through the
    `attn_implementation` argument of `from_pretrained`. When
    `attn_implementation` is None the argument is omitted and transformers
    picks its default.
    """
    load_kwargs = dict(
        device_map="auto",
        # `load_in_4bit=True` as a bare keyword is deprecated; the setting is
        # passed through a BitsAndBytesConfig instead.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        torch_dtype=torch.float16,
    )
    if attn_implementation is not None:
        load_kwargs["attn_implementation"] = attn_implementation
    return AutoModelForCausalLM.from_pretrained(LLM_ID, **load_kwargs)


try:
    # Preferred backend; needs the flash-attn package and a supported GPU.
    llm_model = _load_llm("flash_attention_2")
except (ImportError, ValueError) as err:
    # Report why FlashAttention-2 was rejected rather than silently ignoring it.
    print(f"flash_attention_2 unavailable ({err}); falling back to the default attention implementation.")
    llm_model = _load_llm()

if torch.cuda.is_available():
    torch.backends.cuda.matmul.allow_tf32 = True
llm_model.eval()
torch.set_grad_enabled(False)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

<torch.autograd.grad_mode.set_grad_enabled at 0x77d3cfc648b0>

In [6]:
# ---------------- 텍스트 생성 ----------------
def dynamic_max_new_tokens(question: str) -> int:
    """Pick the generation length from the shape of the question.

    A non-empty line counts as an answer option when, after stripping, it
    starts with digits followed by whitespace, '.' or ')'. Questions with at
    least two such lines get 128 new tokens, all others 256.
    """
    option_re = re.compile(r"^\d+(\s|[.)])")
    n_options = 0
    for raw_line in question.split("\n"):
        line = raw_line.strip()
        if line and option_re.match(line):
            n_options += 1
    if n_options >= 2:
        return 128
    return 256

# Prompt templates for a system/user style conversation.
# NOTE(review): neither constant is referenced by the cells visible here;
# generate_answer builds its prompts inline. Confirm whether they are still needed.
SYSTEM_PROMPT = (
    "당신은 한국 법령, 금융, 보안 도메인 Q/A를 담당하는 도우미입니다. "
    "아는 사실만 간결하게 답하고, 모르면 '알 수 없습니다'라고 말하세요."
)
# Placeholders: {context} for the retrieved text, {query} for the user question.
USER_TPL = (
    "다음 컨텍스트만 사용해 한국어로 정확하게 답하세요.\n"
    "===\n{context}\n===\n질문: {query}"
)

def faiss_search_with_scores(query: str, top_k: int = TOP_K):
    """Return ``(Doc, cosine_similarity)`` pairs for the nearest neighbours of `query`.

    The distances returned by the index are converted with
    ``cos = 1 - d / 2``. This assumes unit-length embeddings and that the
    index reports squared L2 distances — TODO confirm against the FAISS docs.
    Negative ids are skipped.
    """
    query_vec = np.array(embeddings.embed_query(query), dtype=np.float32).reshape(1, -1)
    distances, neighbor_ids = index.search(query_vec, top_k)
    cosines = 1.0 - (distances[0] / 2.0)
    return [
        (docs[int(doc_id)], float(cos_sim))
        for doc_id, cos_sim in zip(neighbor_ids[0], cosines)
        if int(doc_id) >= 0
    ]

def _generate_from_prompt(prompt: str, max_len: int, max_new: int) -> str:
    """Tokenize `prompt`, decode greedily and return only the newly generated text.

    The prompt is truncated to `max_len` tokens. `temperature` is deliberately
    not passed: with ``do_sample=False`` decoding is greedy, so a temperature
    has no effect and only triggers a warning.
    """
    inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len, padding=False)
    inputs = {k: v.to(llm_model.device) for k, v in inputs.items()}
    with torch.inference_mode():
        out = llm_model.generate(
            **inputs,
            max_new_tokens=max_new,
            do_sample=False,
            eos_token_id=llm_tokenizer.eos_token_id,
            pad_token_id=llm_tokenizer.pad_token_id,
        )
    # Drop the prompt tokens so only the continuation is decoded.
    gen = out[0][inputs["input_ids"].shape[1]:]
    return llm_tokenizer.decode(gen, skip_special_tokens=True).strip()


def generate_answer(query: str) -> str:
    """Answer `query`, using retrieved law text when it is relevant enough.

    Flow:
      0. Queries that route_is_law rejects skip retrieval and go straight to
         the model with a generic prompt.
      1. Otherwise the FAISS index is searched and the best cosine similarity
         is taken.
      2. Retrieved context is used only when that similarity reaches THRESH.
      3. The prompt, with or without context, is sent to the model.

    Returns:
        The decoded model continuation, stripped of surrounding whitespace.
    """
    max_new = dynamic_max_new_tokens(query)

    # (0) Routing: non-law/finance/security queries skip retrieval entirely.
    if not route_is_law(query):
        prompt = (
            "당신은 한국어로 간결하고 정확하게 답하는 도우미입니다. "
            "사실에 근거해 답하고, 모르면 '알 수 없습니다'라고 말하세요.\n\n"
            f"질문: {query}"
        )
        return _generate_from_prompt(prompt, max_len=2048, max_new=max_new)

    # (1) Vector search with similarity scores.
    scored = faiss_search_with_scores(query, top_k=TOP_K)
    best_cos = max((s for _, s in scored), default=0.0)

    # (2) Relevance threshold (tuning knob): a higher value means context is
    #     used less often.
    THRESH = 0.70
    use_context = len(scored) > 0 and best_cos >= THRESH

    # (3)+(4) Assemble the context (cached tok_len values are reused) and build the prompt.
    if use_context:
        ctx = pack_context([d for d, _ in scored], token_budget=CTX_TOKEN_BUDGET)
        prompt = (
            "아래 컨텍스트를 우선 사용해 정확히 답하세요. 불충분하면 아는 범위에서만 간결히 답하세요.\n\n"
            f"=== 컨텍스트 ===\n{ctx}\n=== 끝 ===\n"
            f"질문: {query}"
        )
        max_len = 3072
    else:
        prompt = (
            "당신은 한국 법령·금융·보안 Q/A 도우미입니다. "
            "사실에 근거해 간결히 답하고, 모르면 '알 수 없습니다'라고 말하세요.\n\n"
            f"질문: {query}"
        )
        max_len = 2048

    # (5) Tokenize and generate.
    return _generate_from_prompt(prompt, max_len=max_len, max_new=max_new)

In [7]:
# Smoke test: a four-option multiple-choice question about Article 22-2
# (processing personal information of children under 14).
q = """개인정보보호법 제22조의2에 따라 만 14세 미만 아동의 개인정보를 처리하기 위해 필요한 절차로 옳은 것은?
1 아동의 학교의 동의를 받아야 한다.
2 법정대리인의 동의를 받아야 한다.
3 아동 본인의 동의만 받으면 된다.
4 아동의 친구의 동의를 받아야 한다."""
print(generate_answer(q))




5
6
7
8
9
10
11
11
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
