In [None]:
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from google.colab import drive

# =============================================================================
# [0] 구글 드라이브 마운트 및 경로 설정
# =============================================================================
drive.mount('/content/drive')

# 메인 작업 경로 설정
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"

# 작업경로 확인
print(f"작업 경로: {WORK_DIR}")


Mounted at /content/drive
작업 경로: /content/drive/MyDrive/P02_SemanticParsing


In [None]:
import json
import os
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from tqdm.auto import tqdm

# =============================================================================
# [1] 설정 및 데이터 경로
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"  # 학습용 질문 데이터
TABLE_PATH = f"{BASE_DIR}/tables.json" # 테이블 정보
OUTPUT_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2" # 저장 경로

# 배치 사이즈와 에폭 (데이터 양에 따라 조절)
BATCH_SIZE = 32
NUM_EPOCHS = 4

# =============================================================================
# [2] 스키마 텍스트 구성 함수 (Serialization) - 핵심!
# =============================================================================
def make_schema_text(t):
    """
    테이블 정보를 검색에 최적화된 텍스트로 변환합니다.
    형식: [한글명] (영문명) | [설명] | 컬럼: [컬럼리스트]
    """
    # 1. 테이블 이름 (한글/영문 병기)
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]

    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]

    # 2. 컬럼 이름 리스트
    c_names = [c[1] for c in t.get('column_names', [])]
    c_text = ", ".join(c_names)

    # 3. 텍스트 조합
    # 팁: 만약 테이블에 'description' 필드가 있다면 여기에 추가하세요.
    schema_text = f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

    return schema_text

# =============================================================================
# [3] 데이터 로드 및 전처리
# =============================================================================
print("📂 데이터 로딩 중...")

# 1. 테이블 정보 로드 및 텍스트 변환
with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)

table_dict = {}
for t in table_list:
    table_dict[t['db_id']] = make_schema_text(t)

# 2. 학습 데이터(질문-정답) 구성
train_examples = []
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

print(f"🚀 학습 데이터 생성 중 (총 {len(train_data)}건)...")
skipped = 0
for item in tqdm(train_data):
    question = item['question']
    gold_id = item['db_id']

    # 정답 테이블의 텍스트를 가져옴 (Positive Sample)
    if gold_id in table_dict:
        positive_text = table_dict[gold_id]
        # InputExample: [질문, 정답문서]
        train_examples.append(InputExample(texts=[question, positive_text]))
    else:
        skipped += 1

print(f"ℹ️ 데이터 준비 완료 (Skipped: {skipped})")

# =============================================================================
# [4] 모델 학습 (Contrastive Learning)
# =============================================================================
# Base 모델 로드 (다국어 지원 모델)
base_model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(base_model_name)

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# Loss 함수: MultipleNegativesRankingLoss
# 배치 내의 다른 샘플들을 자동으로 Negative로 사용하여 학습 효율 극대화
train_loss = losses.MultipleNegativesRankingLoss(model)

print(f"🔥 학습 시작: {base_model_name} -> Fine-tuning...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=int(len(train_dataloader) * 0.1),
    show_progress_bar=True,
    output_path=OUTPUT_PATH
)

print(f"✅ 모델 저장 완료: {OUTPUT_PATH}")

📂 데이터 로딩 중...
🚀 학습 데이터 생성 중 (총 88946건)...


  0%|          | 0/88946 [00:00<?, ?it/s]

ℹ️ 데이터 준비 완료 (Skipped: 0)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔥 학습 시작: paraphrase-multilingual-MiniLM-L12-v2 -> Fine-tuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 1


[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmistyclawn-dev[0m ([33mmistyclawn-dev-likelion[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.07
1000,0.6003
1500,0.5302
2000,0.4952
2500,0.4549
3000,0.4118
3500,0.3892
4000,0.3874
4500,0.3818
5000,0.371


✅ 모델 저장 완료: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2
이제 2단계 평가 코드를 실행하세요.


In [None]:
!pip install rank_bm25
import json
import torch
import os
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
# ★ 중요: 방금 학습시킨 모델 경로 지정
RETRIEVER_MODEL_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
RERANKER_MODEL_PATH = f"{WORK_DIR}/saved_models_final/epoch_5" # 기존 Reranker 유지
VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"

TEST_LIMIT = 500 # 전체 평가 시 None 또는 len(valid_data)
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🚀 [New Hybrid] Tuned-SBERT + BM25 -> Neural Reranker")

# =============================================================================
# [2] 모델 로드
# =============================================================================
# 1. Fine-tuned Retriever 로드
if os.path.exists(RETRIEVER_MODEL_PATH):
    print(f"✅ 학습된 Retriever 모델 로드: {RETRIEVER_MODEL_PATH}")
    vector_model = SentenceTransformer(RETRIEVER_MODEL_PATH)
else:
    print("⚠️ 학습된 모델이 없습니다. Base 모델을 사용합니다.")
    vector_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# 2. Reranker 로드 (기존 동일)
tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL_PATH).to(device).eval()

# =============================================================================
# [3] 데이터 로드 및 인덱싱 (스키마 보강 적용)
# =============================================================================
with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

# 스키마 텍스트 생성 함수 (학습 코드와 동일하게 유지)
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    c_text = ", ".join(c_names)
    # 검색용 텍스트 (Document)
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

print("📂 인덱싱 데이터 구축 중...")
bm25_corpus = []
vector_corpus = []
db_ids = []

def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

for t in tqdm(table_list, desc="Indexing"):
    # 1. Vector용 텍스트 (보강된 스키마 사용)
    rich_text = make_schema_text(t)
    vector_corpus.append(rich_text)

    # 2. BM25용 텍스트 (키워드 반복 강화)
    # 한글명, 영문명, 컬럼명을 공백으로 나열
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]

    bm25_text = f"{t_name_ko} {t_name_ko} {t_name_en} {' '.join(c_names)}"
    bm25_corpus.append(bigram_tokenizer(bm25_text))

    db_ids.append(t['db_id'])

# 인덱스 빌드
bm25 = BM25Okapi(bm25_corpus)
print("📦 벡터 임베딩 생성 중 (시간이 조금 걸립니다)...")
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True, show_progress_bar=True).to('cpu')

print(f"✅ 인덱싱 완료! (총 {len(db_ids)}개)")

# =============================================================================
# [4] 평가 실행
# =============================================================================
with open(VALID_PATH, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)[:TEST_LIMIT]

stats = {"total": 0, "recall_union": 0, "top1": 0, "top5": 0}

print(f"\n🚀 검색 및 재순위화 평가 시작...")
for item in tqdm(valid_data):
    question = item['question']
    gold_id = item['db_id']
    stats["total"] += 1

    # --- [Step 1] Candidate Generation ---
    candidates = set()

    # A. BM25 (Top 50)
    q_tokens = bigram_tokenizer(question)
    if q_tokens:
        bm25_top = bm25.get_top_n(q_tokens, db_ids, n=50)
        candidates.update(bm25_top)

    # B. Vector (Top 50) - 학습된 모델 사용
    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=50)
    vec_indices = top_vec.indices.tolist()
    candidates.update([db_ids[i] for i in vec_indices])

    candidate_list = list(candidates)

    # Recall 측정
    if gold_id in candidate_list:
        stats["recall_union"] += 1
    else:
        continue # 후보군에 없으면 Reranking 의미 없음

    # --- [Step 2] Reranking ---
    rerank_scores = []
    with torch.no_grad():
        for tid in candidate_list:
            t_info = all_tables[tid]
            # Reranker 입력도 보강된 텍스트를 쓰면 더 좋음
            schema_input = make_schema_text(t_info)

            inputs = tokenizer(question, schema_input, return_tensors="pt", truncation=True, max_length=512).to(device)
            score = reranker_model(**inputs).logits[0][1].item()
            rerank_scores.append((tid, score))

    rerank_scores.sort(key=lambda x: x[1], reverse=True)
    ranked_ids = [s[0] for s in rerank_scores]

    if gold_id == ranked_ids[0]: stats["top1"] += 1
    if gold_id in ranked_ids[:5]: stats["top5"] += 1

# 결과 출력
t = stats["total"]
print(f"\n🏆 최종 평가 결과 (Tuned-Retriever + Schema Augmented)")
print(f"📡 Recall(Union): {stats['recall_union']/t*100:.2f}% (목표: 80% 이상)")
print(f"🥇 Top-1 Acc: {stats['top1']/t*100:.2f}%")
print(f"🖐️ Top-5 Acc: {stats['top5']/t*100:.2f}%")


🚀 [New Hybrid] Tuned-SBERT + BM25 -> Neural Reranker
✅ 학습된 Retriever 모델 로드: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 데이터 구축 중...


Indexing:   0%|          | 0/5761 [00:00<?, ?it/s]

📦 벡터 임베딩 생성 중 (시간이 조금 걸립니다)...


Batches:   0%|          | 0/181 [00:00<?, ?it/s]

✅ 인덱싱 완료! (총 5761개)

🚀 검색 및 재순위화 평가 시작...


  0%|          | 0/500 [00:00<?, ?it/s]


🏆 최종 평가 결과 (Tuned-Retriever + Schema Augmented)
📡 Recall(Union): 69.00% (목표: 80% 이상)
🥇 Top-1 Acc: 19.00%
🖐️ Top-5 Acc: 30.60%


In [None]:
import json
import torch
import random
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from tqdm.auto import tqdm

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
# 방금 학습한 모델을 로드해서 '선생님'으로 씀
PREV_MODEL_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
OUTPUT_PATH = f"{WORK_DIR}/saved_models_final/retriever_hard_neg_v3"

BATCH_SIZE = 16 # Triplet은 메모리를 더 먹으므로 배치 줄임
NUM_EPOCHS = 3

# =============================================================================
# [2] 데이터 로드 및 스키마 텍스트 준비
# =============================================================================
print("📂 데이터 로드 및 임베딩 준비...")
with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)

# 스키마 텍스트 생성 (이전과 동일)
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {', '.join(c_names)}"

all_schemas = [make_schema_text(t) for t in table_list]
all_db_ids = [t['db_id'] for t in table_list]
db_id_to_idx = {db_id: idx for idx, db_id in enumerate(all_db_ids)}

# =============================================================================
# [3] Hard Negative Mining (핵심!)
# =============================================================================
# 현재 모델로 전체 테이블을 인코딩합니다.
model = SentenceTransformer(PREV_MODEL_PATH)
corpus_embeddings = model.encode(all_schemas, convert_to_tensor=True, show_progress_bar=True)

print("⛏️ Hard Negative 채굴 시작...")
train_examples = []

with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

for item in tqdm(train_data):
    question = item['question']
    gold_id = item['db_id']

    if gold_id not in db_id_to_idx: continue

    # 1. 질문 인코딩
    q_emb = model.encode(question, convert_to_tensor=True)

    # 2. 가장 유사한 Top-10 검색
    # (질문과 비슷한데 정답이 아닌 애들을 찾기 위함)
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=10)

    gold_idx = db_id_to_idx[gold_id]
    positive_text = all_schemas[gold_idx]

    # 3. Negative 선정 (Top-10 안에 들었지만 정답이 아닌 것)
    neg_texts = []
    for score, idx in zip(top_results.values, top_results.indices):
        idx = idx.item()
        if idx != gold_idx: # 정답이 아니면 Negative
            neg_texts.append(all_schemas[idx])

    # 4. 데이터셋 추가 (Triplet: Anchor, Positive, Negative)
    # Hard Negative가 있으면 그것을, 없으면 랜덤으로 뽑지 않고 스킵하거나 다음으로 넘어감
    if len(neg_texts) > 0:
        # 가장 헷갈리는 놈(score가 가장 높은 오답) 1개 또는 2개 선택
        hard_negative = neg_texts[0]
        train_examples.append(InputExample(texts=[question, positive_text, hard_negative]))

print(f"✅ 학습 데이터 생성 완료: {len(train_examples)}건 (Hard Negatives 포함)")

# =============================================================================
# [4] 모델 재학습 (Triplet Loss)
# =============================================================================
# Hard Negative 학습에는 TripletLoss가 효과적입니다.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
train_loss = losses.MultipleNegativesRankingLoss(model)
# Tip: MNRL은 (A, P, N) 포맷 데이터가 들어오면 자동으로 Triplet으로 학습합니다.

print(f"🔥 Hard Negative 학습 시작...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=100,
    show_progress_bar=True,
    output_path=OUTPUT_PATH
)

print(f"✅ 최종 모델 저장: {OUTPUT_PATH}")

📂 데이터 로드 및 임베딩 준비...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Batches:   0%|          | 0/181 [00:00<?, ?it/s]

⛏️ Hard Negative 채굴 시작...


  0%|          | 0/88946 [00:00<?, ?it/s]

✅ 학습 데이터 생성 완료: 88946건 (Hard Negatives 포함)
🔥 Hard Negative 학습 시작...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.8868
1000,0.8106
1500,0.7912
2000,0.7793
2500,0.7526
3000,0.7493
3500,0.7417
4000,0.7159
4500,0.6963
5000,0.6751


Step,Training Loss
500,0.8868
1000,0.8106
1500,0.7912
2000,0.7793
2500,0.7526
3000,0.7493
3500,0.7417
4000,0.7159
4500,0.6963
5000,0.6751


✅ 최종 모델 저장: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_hard_neg_v3


In [None]:
!pip install rank_bm25
import json
import torch
import os
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
# ★ 중요: 방금 학습시킨 모델 경로 지정 평가 코드에서 이 부분만 수정
RETRIEVER_MODEL_PATH = f"{WORK_DIR}/saved_models_final/retriever_hard_neg_v3"
# (방금 학습 중인 모델 경로)
RERANKER_MODEL_PATH = f"{WORK_DIR}/saved_models_final/epoch_5" # 기존 Reranker 유지
VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"

TEST_LIMIT = 500 # 전체 평가 시 None 또는 len(valid_data)
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🚀 [New Hybrid] Tuned-SBERT + BM25 -> Neural Reranker")

# =============================================================================
# [2] 모델 로드
# =============================================================================
# 1. Fine-tuned Retriever 로드
if os.path.exists(RETRIEVER_MODEL_PATH):
    print(f"✅ 학습된 Retriever 모델 로드: {RETRIEVER_MODEL_PATH}")
    vector_model = SentenceTransformer(RETRIEVER_MODEL_PATH)
else:
    print("⚠️ 학습된 모델이 없습니다. Base 모델을 사용합니다.")
    vector_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# 2. Reranker 로드 (기존 동일)
tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL_PATH).to(device).eval()

# =============================================================================
# [3] 데이터 로드 및 인덱싱 (스키마 보강 적용)
# =============================================================================
with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

# 스키마 텍스트 생성 함수 (학습 코드와 동일하게 유지)
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    c_text = ", ".join(c_names)
    # 검색용 텍스트 (Document)
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

print("📂 인덱싱 데이터 구축 중...")
bm25_corpus = []
vector_corpus = []
db_ids = []

def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

for t in tqdm(table_list, desc="Indexing"):
    # 1. Vector용 텍스트 (보강된 스키마 사용)
    rich_text = make_schema_text(t)
    vector_corpus.append(rich_text)

    # 2. BM25용 텍스트 (키워드 반복 강화)
    # 한글명, 영문명, 컬럼명을 공백으로 나열
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]

    bm25_text = f"{t_name_ko} {t_name_ko} {t_name_en} {' '.join(c_names)}"
    bm25_corpus.append(bigram_tokenizer(bm25_text))

    db_ids.append(t['db_id'])

# 인덱스 빌드
bm25 = BM25Okapi(bm25_corpus)
print("📦 벡터 임베딩 생성 중 (시간이 조금 걸립니다)...")
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True, show_progress_bar=True).to('cpu')

print(f"✅ 인덱싱 완료! (총 {len(db_ids)}개)")

# =============================================================================
# [4] 평가 실행
# =============================================================================
with open(VALID_PATH, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)[:TEST_LIMIT]

stats = {"total": 0, "recall_union": 0, "top1": 0, "top5": 0}

print(f"\n🚀 검색 및 재순위화 평가 시작...")
for item in tqdm(valid_data):
    question = item['question']
    gold_id = item['db_id']
    stats["total"] += 1

    # --- [Step 1] Candidate Generation ---
    candidates = set()

    # A. BM25 (Top 50)
    q_tokens = bigram_tokenizer(question)
    if q_tokens:
        bm25_top = bm25.get_top_n(q_tokens, db_ids, n=50)
        candidates.update(bm25_top)

    # B. Vector (Top 50) - 학습된 모델 사용
    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=50)
    vec_indices = top_vec.indices.tolist()
    candidates.update([db_ids[i] for i in vec_indices])

    candidate_list = list(candidates)

    # Recall 측정
    if gold_id in candidate_list:
        stats["recall_union"] += 1
    else:
        continue # 후보군에 없으면 Reranking 의미 없음

    # --- [Step 2] Reranking ---
    rerank_scores = []
    with torch.no_grad():
        for tid in candidate_list:
            t_info = all_tables[tid]
            # Reranker 입력도 보강된 텍스트를 쓰면 더 좋음
            schema_input = make_schema_text(t_info)

            inputs = tokenizer(question, schema_input, return_tensors="pt", truncation=True, max_length=512).to(device)
            score = reranker_model(**inputs).logits[0][1].item()
            rerank_scores.append((tid, score))

    rerank_scores.sort(key=lambda x: x[1], reverse=True)
    ranked_ids = [s[0] for s in rerank_scores]

    if gold_id == ranked_ids[0]: stats["top1"] += 1
    if gold_id in ranked_ids[:5]: stats["top5"] += 1

# 결과 출력
t = stats["total"]
print(f"\n🏆 최종 평가 결과 (Tuned-Retriever + Schema Augmented)")
print(f"📡 Recall(Union): {stats['recall_union']/t*100:.2f}% (목표: 80% 이상)")
print(f"🥇 Top-1 Acc: {stats['top1']/t*100:.2f}%")
print(f"🖐️ Top-5 Acc: {stats['top5']/t*100:.2f}%")


🚀 [New Hybrid] Tuned-SBERT + BM25 -> Neural Reranker
✅ 학습된 Retriever 모델 로드: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_hard_neg_v3


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_hard_neg_v3' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 데이터 구축 중...


Indexing:   0%|          | 0/5761 [00:00<?, ?it/s]

📦 벡터 임베딩 생성 중 (시간이 조금 걸립니다)...


Batches:   0%|          | 0/181 [00:00<?, ?it/s]

✅ 인덱싱 완료! (총 5761개)

🚀 검색 및 재순위화 평가 시작...


  0%|          | 0/500 [00:00<?, ?it/s]


🏆 최종 평가 결과 (Tuned-Retriever + Schema Augmented)
📡 Recall(Union): 65.00% (목표: 80% 이상)
🥇 Top-1 Acc: 19.00%
🖐️ Top-5 Acc: 29.80%


In [None]:
import os
import json
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
SAVE_DIR = f"{WORK_DIR}/saved_models_final/reranker_tuned_v2" # 경로 변경

MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
BATCH_SIZE = 16          # 메모리 터지면 8로 줄이세요
ACCUMULATION_STEPS = 4   # 배치 8일 경우 8로 늘리세요
EPOCHS = 3               # 3 epoch면 충분합니다
LR = 3e-5
NEG_RATIO = 3            # 정답 1개당 오답 3개 학습

# =============================================================================
# [2] 데이터셋 (Retriever와 포맷 통일!)
# =============================================================================
class KoreanSchemaDataset(Dataset):
    def __init__(self, data_path, table_path, tokenizer, neg_ratio=3):
        self.tokenizer = tokenizer
        self.neg_ratio = neg_ratio

        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        with open(table_path, 'r', encoding='utf-8') as f:
            table_list = json.load(f)
            self.tables = {t['db_id']: t for t in table_list}

        self.samples = []
        self._create_samples()
        print(f"📌 학습 데이터 생성 완료: {len(self.samples)}건")

    def _make_schema_text(self, t):
        # ★★★ 여기가 핵심 수정 사항입니다 ★★★
        # Retriever 학습 때와 토씨 하나 안 틀리고 똑같은 포맷이어야 합니다.

        # 1. 테이블명
        t_name_ko = t.get('table_names_original', "")
        if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
        t_name_en = t.get('table_names', "")
        if isinstance(t_name_en, list): t_name_en = t_name_en[0]

        # 2. 컬럼명
        c_names = [c[1] for c in t.get('column_names', [])]
        c_text = ", ".join(c_names)

        # 3. 포맷 조합 (테이블: ... | 컬럼: ...)
        return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

    def _create_samples(self):
        table_ids = list(self.tables.keys())
        for item in tqdm(self.data, desc="Building Dataset"):
            q = item['question']
            gold_id = item['db_id']

            if gold_id in self.tables:
                # Positive Sample
                self._add_sample(q, gold_id, 1)

                # Negative Sampling
                negs = random.sample([t for t in table_ids if t != gold_id], self.neg_ratio)
                for nid in negs:
                    self._add_sample(q, nid, 0)

    def _add_sample(self, q, tid, label):
        t_info = self.tables[tid]
        schema_text = self._make_schema_text(t_info) # 수정된 함수 사용

        tokenized = self.tokenizer(
            q, schema_text,
            truncation=True, max_length=512, padding=False,
            return_token_type_ids=True # KoELECTRA는 이게 중요함
        )

        self.samples.append({
            'input_ids': tokenized['input_ids'],
            'token_type_ids': tokenized['token_type_ids'], # Segment ID
            'attention_mask': tokenized['attention_mask'],
            'label': label
        })

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return self.samples[idx]

# =============================================================================
# [3] 학습 루프 (Gradient Accumulation)
# =============================================================================
def collate_fn(batch):
    tokenizer.pad_token_id = 0
    max_len = max([len(b['input_ids']) for b in batch])

    input_ids = []
    token_type_ids = []
    attention_mask = []
    labels = []

    for b in batch:
        # Dynamic Padding
        pad_len = max_len - len(b['input_ids'])
        input_ids.append(b['input_ids'] + [0]*pad_len)
        token_type_ids.append(b['token_type_ids'] + [0]*pad_len)
        attention_mask.append(b['attention_mask'] + [0]*pad_len)
        labels.append(b['label'])

    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels)
    }

if __name__ == '__main__':
    os.makedirs(SAVE_DIR, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"🔥 Reranker 재학습 시작 (GPU: {torch.cuda.get_device_name(0)})")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    dataset = KoreanSchemaDataset(TRAIN_PATH, TABLE_PATH, tokenizer, neg_ratio=NEG_RATIO)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        optimizer.zero_grad()

        loop = tqdm(loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        for step, batch in enumerate(loop):
            input_ids = batch['input_ids'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast():
                outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                loss = loss / ACCUMULATION_STEPS

            scaler.scale(loss).backward()

            if (step + 1) % ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            total_loss += loss.item() * ACCUMULATION_STEPS
            loop.set_postfix(loss=f"{loss.item() * ACCUMULATION_STEPS:.4f}")

        print(f"✨ Epoch {epoch+1} Avg Loss: {total_loss / len(loader):.4f}")

        # 모델 저장
        model.save_pretrained(f"{SAVE_DIR}/epoch_{epoch+1}")
        tokenizer.save_pretrained(f"{SAVE_DIR}/epoch_{epoch+1}")

    print("🎉 Reranker 재학습 완료! 이제 평가를 다시 돌려보세요.")

🔥 Reranker 재학습 시작 (GPU: NVIDIA A100-SXM4-40GB)


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Building Dataset:   0%|          | 0/88946 [00:00<?, ?it/s]

📌 학습 데이터 생성 완료: 355784건


pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1/3:   0%|          | 0/22237 [00:00<?, ?it/s]

  with autocast():


model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

✨ Epoch 1 Avg Loss: 0.0724


Epoch 2/3:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 2 Avg Loss: 0.0505


Epoch 3/3:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 3 Avg Loss: 0.0446
🎉 Reranker 재학습 완료! 이제 평가를 다시 돌려보세요.


In [None]:
!pip install rank_bm25
import json
import torch
import os
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
# ★ 중요: 방금 학습시킨 모델 경로 지정
RETRIEVER_MODEL_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
RERANKER_MODEL_PATH = f"{WORK_DIR}/saved_models_final/reranker_tuned_v2/epoch_3"
VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"

TEST_LIMIT = 500 # 전체 평가 시 None 또는 len(valid_data)
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🚀 [New Hybrid] Tuned-SBERT + BM25 -> Neural Reranker")

# =============================================================================
# [2] 모델 로드
# =============================================================================
# 1. Fine-tuned Retriever 로드
if os.path.exists(RETRIEVER_MODEL_PATH):
    print(f"✅ 학습된 Retriever 모델 로드: {RETRIEVER_MODEL_PATH}")
    vector_model = SentenceTransformer(RETRIEVER_MODEL_PATH)
else:
    print("⚠️ 학습된 모델이 없습니다. Base 모델을 사용합니다.")
    vector_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# 2. Reranker 로드 (기존 동일)
tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL_PATH).to(device).eval()

# =============================================================================
# [3] 데이터 로드 및 인덱싱 (스키마 보강 적용)
# =============================================================================
with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

# 스키마 텍스트 생성 함수 (학습 코드와 동일하게 유지)
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    c_text = ", ".join(c_names)
    # 검색용 텍스트 (Document)
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

print("📂 인덱싱 데이터 구축 중...")
bm25_corpus = []
vector_corpus = []
db_ids = []

def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

for t in tqdm(table_list, desc="Indexing"):
    # 1. Vector용 텍스트 (보강된 스키마 사용)
    rich_text = make_schema_text(t)
    vector_corpus.append(rich_text)

    # 2. BM25용 텍스트 (키워드 반복 강화)
    # 한글명, 영문명, 컬럼명을 공백으로 나열
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]

    bm25_text = f"{t_name_ko} {t_name_ko} {t_name_en} {' '.join(c_names)}"
    bm25_corpus.append(bigram_tokenizer(bm25_text))

    db_ids.append(t['db_id'])

# 인덱스 빌드
bm25 = BM25Okapi(bm25_corpus)
print("📦 벡터 임베딩 생성 중 (시간이 조금 걸립니다)...")
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True, show_progress_bar=True).to('cpu')

print(f"✅ 인덱싱 완료! (총 {len(db_ids)}개)")

# =============================================================================
# [4] 평가 실행
# =============================================================================
with open(VALID_PATH, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)[:TEST_LIMIT]

stats = {"total": 0, "recall_union": 0, "top1": 0, "top5": 0}

print(f"\n🚀 검색 및 재순위화 평가 시작...")
for item in tqdm(valid_data):
    question = item['question']
    gold_id = item['db_id']
    stats["total"] += 1

    # --- [Step 1] Candidate Generation ---
    candidates = set()

    # A. BM25 (Top 50)
    q_tokens = bigram_tokenizer(question)
    if q_tokens:
        bm25_top = bm25.get_top_n(q_tokens, db_ids, n=50)
        candidates.update(bm25_top)

    # B. Vector (Top 50) - 학습된 모델 사용
    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=50)
    vec_indices = top_vec.indices.tolist()
    candidates.update([db_ids[i] for i in vec_indices])

    candidate_list = list(candidates)

    # Recall 측정
    if gold_id in candidate_list:
        stats["recall_union"] += 1
    else:
        continue # 후보군에 없으면 Reranking 의미 없음

    # --- [Step 2] Reranking ---
    rerank_scores = []
    with torch.no_grad():
        for tid in candidate_list:
            t_info = all_tables[tid]
            # Reranker 입력도 보강된 텍스트를 쓰면 더 좋음
            schema_input = make_schema_text(t_info)

            inputs = tokenizer(question, schema_input, return_tensors="pt", truncation=True, max_length=512).to(device)
            score = reranker_model(**inputs).logits[0][1].item()
            rerank_scores.append((tid, score))

    rerank_scores.sort(key=lambda x: x[1], reverse=True)
    ranked_ids = [s[0] for s in rerank_scores]

    if gold_id == ranked_ids[0]: stats["top1"] += 1
    if gold_id in ranked_ids[:5]: stats["top5"] += 1

# 결과 출력
t = stats["total"]
print(f"\n🏆 최종 평가 결과 (Tuned-Retriever + Schema Augmented)")
print(f"📡 Recall(Union): {stats['recall_union']/t*100:.2f}% (목표: 80% 이상)")
print(f"🥇 Top-1 Acc: {stats['top1']/t*100:.2f}%")
print(f"🖐️ Top-5 Acc: {stats['top5']/t*100:.2f}%")


🚀 [New Hybrid] Tuned-SBERT + BM25 -> Neural Reranker
✅ 학습된 Retriever 모델 로드: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 데이터 구축 중...


Indexing:   0%|          | 0/5761 [00:00<?, ?it/s]

📦 벡터 임베딩 생성 중 (시간이 조금 걸립니다)...


Batches:   0%|          | 0/181 [00:00<?, ?it/s]

✅ 인덱싱 완료! (총 5761개)

🚀 검색 및 재순위화 평가 시작...


  0%|          | 0/500 [00:00<?, ?it/s]


🏆 최종 평가 결과 (Tuned-Retriever + Schema Augmented)
📡 Recall(Union): 69.00% (목표: 80% 이상)
🥇 Top-1 Acc: 21.20%
🖐️ Top-5 Acc: 31.40%


In [None]:
import torch
import json
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
from tqdm.auto import tqdm

# =============================================================================
# [1] 경로 설정 (여기를 정확히 확인하세요!)
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"

# ★★★ 여기가 핵심입니다. 방금 학습시킨 폴더명과 일치해야 합니다 ★★★
# 만약 epoch_3 폴더가 없다면 그 상위 폴더를 지정해보세요.
RERANKER_PATH = f"{WORK_DIR}/saved_models_final/reranker_tuned_v2/epoch_3"

RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
DATA_DIR = f"{WORK_DIR}/nia"

# =============================================================================
# [2] 디버깅 함수 (입력 포맷 확인용)
# =============================================================================
def make_schema_text(t):
    # Reranker 학습 때와 100% 동일해야 함
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]

    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {', '.join(c_names)}"

# =============================================================================
# [3] 모델 로드 및 검증
# =============================================================================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔥 디버깅 시작 (Device: {device})")

# 1. Reranker 로드 확인
if os.path.exists(RERANKER_PATH):
    print(f"✅ Reranker 경로 확인됨: {RERANKER_PATH}")
    reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_PATH).to(device).eval()
    tokenizer = AutoTokenizer.from_pretrained(RERANKER_PATH)
else:
    print(f"🚨 [치명적 오류] Reranker 경로가 없습니다: {RERANKER_PATH}")
    print("   -> 경로를 다시 확인하거나, 학습이 제대로 저장되었는지 확인하세요.")
    # 강제 종료 (잘못된 평가 방지)
    raise FileNotFoundError("Reranker Model Not Found")

# 2. Retriever 로드
vector_model = SentenceTransformer(RETRIEVER_PATH)

# =============================================================================
# [4] 데이터 로드
# =============================================================================
with open(f"{DATA_DIR}/tables.json", 'r') as f:
    tables = {t['db_id']: t for t in json.load(f)}

# 인덱싱 (간소화)
print("📂 인덱싱 중...")
db_ids = list(tables.keys())
corpus_embeddings = vector_model.encode(
    [make_schema_text(tables[tid]) for tid in db_ids],
    convert_to_tensor=True, show_progress_bar=True
).to('cpu')

# =============================================================================
# [5] 실제 점수 찍어보기 (Sanity Check)
# =============================================================================
print("\n🔎 [Sanity Check] 모델이 정답을 알아보는지 테스트합니다.")

# 검증 데이터 1개만 뽑아서 점수 확인
with open(f"{DATA_DIR}/valid.json", 'r') as f:
    sample = json.load(f)[0] # 첫 번째 질문

q = sample['question']
gold = sample['db_id']
gold_schema = make_schema_text(tables[gold])
neg_id = [k for k in tables.keys() if k != gold][0] # 아무 오답
neg_schema = make_schema_text(tables[neg_id])

print(f"질문: {q}")
print(f"정답 테이블: {gold}")

# 점수 계산
with torch.no_grad():
    # 정답 점수
    inputs_pos = tokenizer(q, gold_schema, return_tensors='pt', truncation=True, max_length=512).to(device)
    score_pos = reranker_model(**inputs_pos).logits[0][1].item()

    # 오답 점수
    inputs_neg = tokenizer(q, neg_schema, return_tensors='pt', truncation=True, max_length=512).to(device)
    score_neg = reranker_model(**inputs_neg).logits[0][1].item()

print(f"✅ 정답 점수: {score_pos:.4f}")
print(f"❌ 오답 점수: {score_neg:.4f}")

if score_pos > score_neg:
    print("🎉 모델이 정상적으로 정답을 더 좋아합니다! (Evaluation 진행하세요)")
    # 여기에 평가 루프를 돌리시면 됩니다.
else:
    print("💀 모델이 멍청합니다. (정답 < 오답) -> 학습이 잘못되었거나 로드가 잘못됨.")

🔥 디버깅 시작 (Device: cuda)
✅ Reranker 경로 확인됨: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/reranker_tuned_v2/epoch_3


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 중...


Batches:   0%|          | 0/181 [00:00<?, ?it/s]


🔎 [Sanity Check] 모델이 정답을 알아보는지 테스트합니다.
질문: 3층메디칼약국의 주소를 알려줘
정답 테이블: seouldata_healthcare_733
✅ 정답 점수: 2.4363
❌ 오답 점수: -5.7676
🎉 모델이 정상적으로 정답을 더 좋아합니다! (Evaluation 진행하세요)


In [None]:
import json
import torch
import os
import numpy as np
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 설정 & 경로 (검증된 경로)
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"

# ★ Sanity Check에서 검증된 경로 그대로 사용
RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
RERANKER_PATH = f"{WORK_DIR}/saved_models_final/reranker_tuned_v2/epoch_3"

VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
TEST_LIMIT = 500  # 전체 평가
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🔥 최종 평가 시작 (Device: {device})")

# =============================================================================
# [2] 핵심 함수: 스키마 텍스트 생성 (학습 때와 100% 동일해야 함)
# =============================================================================
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]

    # Reranker가 학습한 바로 그 포맷!
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {', '.join(c_names)}"

# BM25용 토크나이저
def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

# =============================================================================
# [3] 모델 및 데이터 로드
# =============================================================================
print("📂 모델 로딩 중...")
# 1. Retriever
vector_model = SentenceTransformer(RETRIEVER_PATH)

# 2. Reranker
tokenizer = AutoTokenizer.from_pretrained(RERANKER_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_PATH).to(device).eval()

# 3. 데이터
with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

with open(VALID_PATH, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)[:TEST_LIMIT]

# =============================================================================
# [4] 인덱싱 (Indexing)
# =============================================================================
print("📂 인덱싱 구축 중...")
bm25_corpus = []
vector_corpus = []
db_ids = []

for t in tqdm(table_list, desc="Indexing"):
    # Vector & Reranker용 텍스트 (Rich Text)
    rich_text = make_schema_text(t)
    vector_corpus.append(rich_text)

    # BM25용 텍스트 (키워드 중심)
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    bm25_text = f"{t_name_ko} {t_name_ko} {' '.join(c_names)}"
    bm25_corpus.append(bigram_tokenizer(bm25_text))

    db_ids.append(t['db_id'])

bm25 = BM25Okapi(bm25_corpus)
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True, show_progress_bar=True).to('cpu')

print("✅ 준비 완료! 평가 루프 진입...")

# =============================================================================
# [5] 평가 루프 (Retrieval -> Reranking)
# =============================================================================
stats = {"total": 0, "recall_union": 0, "top1": 0, "top5": 0}

for item in tqdm(valid_data, desc="Evaluating"):
    question = item['question']
    gold_id = item['db_id']
    stats["total"] += 1

    # --- Step 1: Candidate Generation (Hybrid) ---
    candidates = set()

    # BM25
    q_tokens = bigram_tokenizer(question)
    if q_tokens:
        candidates.update(bm25.get_top_n(q_tokens, db_ids, n=50))

    # Vector
    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=50)
    candidates.update([db_ids[i] for i in top_vec.indices.tolist()])

    candidate_list = list(candidates)

    # Recall Check
    if gold_id in candidate_list:
        stats["recall_union"] += 1
    else:
        continue # 후보군에 없으면 Reranking 해도 소용없음

    # --- Step 2: Reranking (Neural) ---
    rerank_scores = []

    # 후보군들에 대해 점수 매기기 (Batch 처리 대신 루프로 명확하게)
    with torch.no_grad():
        for tid in candidate_list:
            t_info = all_tables[tid]
            # ★ 여기서 정확한 포맷을 넣어주는 게 승부처 ★
            schema_input = make_schema_text(t_info)

            inputs = tokenizer(question, schema_input, return_tensors="pt", truncation=True, max_length=512).to(device)
            # Logits: [Negative_Score, Positive_Score] -> Positive_Score(idx 1) 사용
            score = reranker_model(**inputs).logits[0][1].item()
            rerank_scores.append((tid, score))

    # 점수 높은 순 정렬
    rerank_scores.sort(key=lambda x: x[1], reverse=True)
    ranked_ids = [s[0] for s in rerank_scores]

    # Accuracy Check
    if gold_id == ranked_ids[0]: stats["top1"] += 1
    if gold_id in ranked_ids[:5]: stats["top5"] += 1

# =============================================================================
# [6] 결과 리포트
# =============================================================================
t = stats["total"]
print("\n" + "="*60)
print(f"🏆 최종 성능 리포트 (Final Score)")
print("-" * 60)
print(f"✅ 1. Retrieval Recall (Union): {stats['recall_union']/t*100:.2f}%")
print(f"   (Retriever가 후보군에 정답을 포함시킨 비율)")
print("-" * 60)
print(f"🥇 2. Final Top-1 Accuracy:    {stats['top1']/t*100:.2f}%")
print(f"   (Reranker가 정확히 정답을 1등으로 뽑은 비율)")
print("-" * 60)
print(f"🖐️ 3. Final Top-5 Accuracy:    {stats['top5']/t*100:.2f}%")
print("="*60)

🔥 최종 평가 시작 (Device: cuda)
📂 모델 로딩 중...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 구축 중...


Indexing:   0%|          | 0/5761 [00:00<?, ?it/s]

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

✅ 준비 완료! 평가 루프 진입...


Evaluating:   0%|          | 0/500 [00:00<?, ?it/s]


🏆 최종 성능 리포트 (Final Score)
------------------------------------------------------------
✅ 1. Retrieval Recall (Union): 67.60%
   (Retriever가 후보군에 정답을 포함시킨 비율)
------------------------------------------------------------
🥇 2. Final Top-1 Accuracy:    21.00%
   (Reranker가 정확히 정답을 1등으로 뽑은 비율)
------------------------------------------------------------
🖐️ 3. Final Top-5 Accuracy:    31.60%


In [None]:
import os
import json
import torch
import random
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
SAVE_DIR = f"{WORK_DIR}/saved_models_final/reranker_hard_v3"

# 학습된 Retriever (채굴용)
RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
# Base Reranker (학습용)
RERANKER_MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5  # 미세 조정이므로 학습률을 조금 낮춤

# =============================================================================
# [2] Hard Negative 채굴 (Mining)
# =============================================================================
print("⛏️ Hard Negative 채굴 시작 (시간이 좀 걸립니다)...")

# 1. 스키마 텍스트 생성 (Retriever용)
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {', '.join(c_names)}"

with open(TABLE_PATH, 'r') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}
    all_db_ids = [t['db_id'] for t in table_list]
    all_schemas = [make_schema_text(t) for t in table_list]

# 2. Retriever 로드 & 전체 테이블 임베딩
vector_model = SentenceTransformer(RETRIEVER_PATH)
corpus_embeddings = vector_model.encode(all_schemas, convert_to_tensor=True, show_progress_bar=True)

# 3. Train 데이터에 대해 검색 수행 -> Hard Negative 추출
mined_data = []
with open(TRAIN_PATH, 'r') as f:
    train_data = json.load(f)

for item in tqdm(train_data, desc="Mining"):
    question = item['question']
    gold_id = item['db_id']

    # 질문 임베딩
    q_emb = vector_model.encode(question, convert_to_tensor=True)

    # Top-10 검색
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=10)

    indices = top_results.indices.tolist()

    # Hard Negative 선정 (검색됐는데 정답이 아닌 녀석들)
    hard_negs = []
    for idx in indices:
        cand_id = all_db_ids[idx]
        if cand_id != gold_id:
            hard_negs.append(cand_id)

    # 데이터셋 구성: (질문, 정답ID, 오답ID_리스트)
    if gold_id in all_tables and len(hard_negs) > 0:
        mined_data.append({
            "q": question,
            "pos": gold_id,
            "negs": hard_negs[:3] # 상위 3개의 강력한 오답만 사용
        })

print(f"✅ 채굴 완료: {len(mined_data)}건의 데이터셋 생성됨.")

# =============================================================================
# [3] Reranker 데이터셋 (Hard Negative 적용)
# =============================================================================
class HardNegativeDataset(Dataset):
    def __init__(self, mined_data, tables, tokenizer):
        self.tokenizer = tokenizer
        self.samples = []
        self.tables = tables

        for item in tqdm(mined_data, desc="Tokenizing"):
            q = item['q']
            pos_id = item['pos']
            neg_ids = item['negs']

            # Positive Sample (Label 1)
            self._add_sample(q, pos_id, 1)

            # Hard Negative Samples (Label 0)
            for nid in neg_ids:
                self._add_sample(q, nid, 0)

    def _make_schema_text(self, t):
        # 학습 포맷 통일
        t_name_ko = t.get('table_names_original', "")
        if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
        t_name_en = t.get('table_names', "")
        if isinstance(t_name_en, list): t_name_en = t_name_en[0]
        c_names = [c[1] for c in t.get('column_names', [])]
        return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {', '.join(c_names)}"

    def _add_sample(self, q, tid, label):
        schema = self._make_schema_text(self.tables[tid])
        tokenized = self.tokenizer(
            q, schema, truncation=True, max_length=512, padding=False
        )
        self.samples.append({
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'token_type_ids': tokenized.get('token_type_ids', [0]*len(tokenized['input_ids'])),
            'label': label
        })

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return self.samples[idx]

# =============================================================================
# [4] Reranker 재학습
# =============================================================================
tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL_NAME)
dataset = HardNegativeDataset(mined_data, all_tables, tokenizer)

def collate_fn(batch):
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []

    for b in batch:
        pad_len = max_len - len(b['input_ids'])
        input_ids.append(b['input_ids'] + [0]*pad_len)
        token_type_ids.append(b['token_type_ids'] + [0]*pad_len)
        attention_mask.append(b['attention_mask'] + [0]*pad_len)
        labels.append(b['label'])

    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels)
    }

loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL_NAME, num_labels=2).to("cuda")
optimizer = AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
scaler = GradScaler()

print(f"🔥 Hard Negative Reranker 학습 시작 (총 {len(dataset)} 샘플)")

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(loader, desc=f"Epoch {epoch+1}")

    for batch in loop:
        input_ids = batch['input_ids'].to("cuda")
        token_type_ids = batch['token_type_ids'].to("cuda")
        attention_mask = batch['attention_mask'].to("cuda")
        labels = batch['labels'].to("cuda")

        optimizer.zero_grad()
        with autocast():
            outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        loop.set_postfix(loss=f"{loss.item():.4f}")

    # 저장
    save_path = f"{SAVE_DIR}/epoch_{epoch+1}"
    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✨ Epoch {epoch+1} 저장 완료: {save_path}")

print("🎉 모든 학습 완료! 최종 평가를 다시 돌려보세요.")

⛏️ Hard Negative 채굴 시작 (시간이 좀 걸립니다)...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Batches:   0%|          | 0/181 [00:00<?, ?it/s]

Mining:   0%|          | 0/88946 [00:00<?, ?it/s]

✅ 채굴 완료: 88946건의 데이터셋 생성됨.


Tokenizing:   0%|          | 0/88946 [00:00<?, ?it/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔥 Hard Negative Reranker 학습 시작 (총 355784 샘플)


  scaler = GradScaler()


Epoch 1:   0%|          | 0/22237 [00:00<?, ?it/s]

  with autocast():


✨ Epoch 1 저장 완료: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/reranker_hard_v3/epoch_1


Epoch 2:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 2 저장 완료: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/reranker_hard_v3/epoch_2


Epoch 3:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 3 저장 완료: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/reranker_hard_v3/epoch_3
🎉 모든 학습 완료! 최종 평가를 다시 돌려보세요.


In [None]:
import json
import torch
import os
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 경로 설정 (★여기가 제일 중요합니다★)
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"

# 1. Retriever: 69% 성능을 보여줬던 그 녀석 (v2)
RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"

# 2. Reranker: 방금 Hard Negative 학습을 마친 따끈따끈한 녀석 (v3)
RERANKER_PATH = f"{WORK_DIR}/saved_models_final/reranker_hard_v3/epoch_3"

VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
TEST_LIMIT = 500 # 전체 평가
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🔥 최종 결전 평가 시작 (Device: {device})")
print(f"👉 Reranker: {RERANKER_PATH}")

# =============================================================================
# [2] 스키마 텍스트 생성 (학습 포맷과 100% 일치)
# =============================================================================
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    t_name_en = t.get('table_names', "")
    if isinstance(t_name_en, list): t_name_en = t_name_en[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {', '.join(c_names)}"

def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

# =============================================================================
# [3] 모델 및 데이터 로드
# =============================================================================
print("📂 모델 로딩 중...")
vector_model = SentenceTransformer(RETRIEVER_PATH)
tokenizer = AutoTokenizer.from_pretrained(RERANKER_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_PATH).to(device).eval()

with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

with open(VALID_PATH, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)[:TEST_LIMIT]

# =============================================================================
# [4] 인덱싱 (Indexing)
# =============================================================================
print("📂 인덱싱 구축 중...")
bm25_corpus = []
vector_corpus = []
db_ids = []

for t in tqdm(table_list, desc="Indexing"):
    rich_text = make_schema_text(t)
    vector_corpus.append(rich_text)

    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    bm25_text = f"{t_name_ko} {t_name_ko} {' '.join(c_names)}"
    bm25_corpus.append(bigram_tokenizer(bm25_text))

    db_ids.append(t['db_id'])

bm25 = BM25Okapi(bm25_corpus)
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True, show_progress_bar=True).to('cpu')

# =============================================================================
# [5] 평가 루프
# =============================================================================
stats = {"total": 0, "recall_union": 0, "top1": 0, "top5": 0}

for item in tqdm(valid_data, desc="Evaluating"):
    question = item['question']
    gold_id = item['db_id']
    stats["total"] += 1

    # [Step 1] Candidate Generation
    candidates = set()
    q_tokens = bigram_tokenizer(question)
    if q_tokens:
        candidates.update(bm25.get_top_n(q_tokens, db_ids, n=50))

    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=50)
    candidates.update([db_ids[i] for i in top_vec.indices.tolist()])

    candidate_list = list(candidates)

    if gold_id in candidate_list:
        stats["recall_union"] += 1
    else:
        continue

    # [Step 2] Reranking
    rerank_scores = []
    with torch.no_grad():
        # Batch 처리가 아니므로 속도가 조금 느릴 수 있지만 정확도는 확실함
        for tid in candidate_list:
            t_info = all_tables[tid]
            schema_input = make_schema_text(t_info)
            inputs = tokenizer(question, schema_input, return_tensors="pt", truncation=True, max_length=512).to(device)
            score = reranker_model(**inputs).logits[0][1].item()
            rerank_scores.append((tid, score))

    rerank_scores.sort(key=lambda x: x[1], reverse=True)
    ranked_ids = [s[0] for s in rerank_scores]

    if gold_id == ranked_ids[0]: stats["top1"] += 1
    if gold_id in ranked_ids[:5]: stats["top5"] += 1

# =============================================================================
# [6] 결과 출력
# =============================================================================
t = stats["total"]
print("\n" + "="*60)
print(f"🏆 Hard Negative 학습 후 최종 결과")
print("-" * 60)
print(f"✅ Recall (Target: 70%): {stats['recall_union']/t*100:.2f}%")
print("-" * 60)
print(f"🥇 Top-1 Accuracy (Target: 50%+): {stats['top1']/t*100:.2f}%")
print(f"🖐️ Top-5 Accuracy:               {stats['top5']/t*100:.2f}%")
print("="*60)

🔥 최종 결전 평가 시작 (Device: cuda)
👉 Reranker: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/reranker_hard_v3/epoch_3
📂 모델 로딩 중...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 구축 중...


Indexing:   0%|          | 0/5761 [00:00<?, ?it/s]

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/500 [00:00<?, ?it/s]


🏆 Hard Negative 학습 후 최종 결과
------------------------------------------------------------
✅ Recall (Target: 70%): 67.60%
------------------------------------------------------------
🥇 Top-1 Accuracy (Target: 50%+): 18.20%
🖐️ Top-5 Accuracy:               25.40%


In [None]:
import torch
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
from tqdm.auto import tqdm

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"

RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
RERANKER_PATH = f"{WORK_DIR}/saved_models_final/reranker_hard_v3/epoch_3" # 문제의 모델
VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

# =============================================================================
# [2] 함수 및 로드
# =============================================================================
def make_schema_text(t):
    t_name_ko = t.get('table_names_original', "")
    if isinstance(t_name_ko, list): t_name_ko = t_name_ko[0]
    c_names = [c[1] for c in t.get('column_names', [])]
    # ★ 길이 체크를 위해 여기서 출력 안함, 아래에서 직접 확인
    return f"테이블: {t_name_ko} | 컬럼: {', '.join(c_names)}"

def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

print("📂 모델 로딩...")
vector_model = SentenceTransformer(RETRIEVER_PATH)
tokenizer = AutoTokenizer.from_pretrained(RERANKER_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_PATH).to(device).eval()

with open(TABLE_PATH, 'r') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

with open(VALID_PATH, 'r') as f:
    valid_data = json.load(f)[:100] # 100개만 빠르게 분석

# 인덱싱 (간소화)
print("📂 인덱싱...")
vector_corpus = [make_schema_text(t) for t in table_list]
bm25_corpus = [bigram_tokenizer(t) for t in vector_corpus]
db_ids = [t['db_id'] for t in table_list]

bm25 = BM25Okapi(bm25_corpus)
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True).to('cpu')

# =============================================================================
# [3] 부검 시작 (Failure Analysis)
# =============================================================================
print("\n🚑 실패 사례 정밀 분석 시작...")

fail_count = 0
for item in valid_data:
    if fail_count >= 3: break # 3개만 보고 멈춤

    question = item['question']
    gold_id = item['db_id']

    # 1. 후보군 생성
    candidates = set()
    candidates.update(bm25.get_top_n(bigram_tokenizer(question), db_ids, n=30))
    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=30)
    candidates.update([db_ids[i] for i in top_vec.indices.tolist()])
    candidate_list = list(candidates)

    # 정답이 후보군에 없으면 분석 불가 (Retriever 문제) -> 패스
    if gold_id not in candidate_list: continue

    # 2. Reranking
    scores = []
    with torch.no_grad():
        for tid in candidate_list:
            schema = make_schema_text(all_tables[tid])
            inputs = tokenizer(question, schema, return_tensors="pt", truncation=True, max_length=512).to(device)
            s = reranker_model(**inputs).logits[0][1].item()
            scores.append((tid, s, schema)) # 스키마 내용도 같이 저장

    scores.sort(key=lambda x: x[1], reverse=True)
    top1_id = scores[0][0]

    # 3. 실패 케이스 포착! (정답이 1등이 아님)
    if top1_id != gold_id:
        fail_count += 1
        gold_entry = next(x for x in scores if x[0] == gold_id)

        print(f"\n❌ [Case {fail_count}] 질문: {question}")
        print("-" * 60)
        print(f"🔴 정답 (Rank {scores.index(gold_entry)+1}위): {gold_id}")
        print(f"   점수: {gold_entry[1]:.4f}")
        print(f"   내용(앞부분): {gold_entry[2][:100]}...")
        print("-" * 60)
        print(f"🔵 모델 예측 (Rank 1위): {top1_id}")
        print(f"   점수: {scores[0][1]:.4f}")
        print(f"   내용(앞부분): {scores[0][2][:100]}...")

        # ★ 토큰 길이 체크 (Truncation 확인)
        gold_tokens = tokenizer(question, gold_entry[2])['input_ids']
        print(f"⚠️ 정답 데이터 토큰 길이: {len(gold_tokens)} (512 넘으면 잘린 것임!)")
        print("=" * 60)

📂 모델 로딩...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱...

🚑 실패 사례 정밀 분석 시작...

❌ [Case 1] 질문: 3층메디칼약국의 주소를 알려줘
------------------------------------------------------------
🔴 정답 (Rank 3위): seouldata_healthcare_733
   점수: -0.1762
   내용(앞부분): 테이블: TB_PHARMACY_OPERATE_INFO | 컬럼: *, 약국아이디, 주소, 약국명, 대표전화, 월요일 진료 마감 시간, 화요일 진료 마감 시간, 수요일 진료 마감 시...
------------------------------------------------------------
🔵 모델 예측 (Rank 1위): seouldata_healthcare_5339
   점수: 0.1520
   내용(앞부분): 테이블: G_B_D_R_U_G_S_T_O_R_E_I_N_F_O | 컬럼: *, 순번, 업종, 약국명, 약국소재지, 약국전화번호...
⚠️ 정답 데이터 토큰 길이: 155 (512 넘으면 잘린 것임!)

❌ [Case 2] 질문: 약국 이름이 일곱 글자인 곳의 주소가 뭐야
------------------------------------------------------------
🔴 정답 (Rank 3위): seouldata_healthcare_733
   점수: 0.2840
   내용(앞부분): 테이블: TB_PHARMACY_OPERATE_INFO | 컬럼: *, 약국아이디, 주소, 약국명, 대표전화, 월요일 진료 마감 시간, 화요일 진료 마감 시간, 수요일 진료 마감 시...
------------------------------------------------------------
🔵 모델 예측 (Rank 1위): publicdata_publicadministration_1382
   점수: 0.5440
   내용(앞부분): 테이블: PHARMACY_ANSAN_GYEONGGI_PROVINCE | 컬럼: *

In [None]:
import os
import json
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
SAVE_DIR = f"{WORK_DIR}/saved_models_final/reranker_fixed_v4" # 버전 4 (최종)

MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
BATCH_SIZE = 16
EPOCHS = 3
LR = 3e-5
NEG_RATIO = 3

# =============================================================================
# [2] 데이터셋 (★ 매핑 버그 수정 완료 ★)
# =============================================================================
class KoreanSchemaDataset(Dataset):
    def __init__(self, data_path, table_path, tokenizer, neg_ratio=3):
        self.tokenizer = tokenizer
        self.neg_ratio = neg_ratio

        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        with open(table_path, 'r', encoding='utf-8') as f:
            table_list = json.load(f)
            self.tables = {t['db_id']: t for t in table_list}

        self.samples = []
        self._create_samples()
        print(f"📌 학습 데이터 생성 완료: {len(self.samples)}건")

    def _make_schema_text(self, t):
        # 🚨 [버그 수정] NIA 데이터셋 키 매핑 정정
        # table_names -> 한글명 (Korean)
        # table_names_original -> 영문명 (English)

        # 1. 한글 테이블명 (우선순위 1등)
        t_names = t.get('table_names', [])
        t_name_ko = t_names[0] if isinstance(t_names, list) and len(t_names) > 0 else str(t_names)

        # 2. 영문 테이블명 (보조 정보)
        t_names_en = t.get('table_names_original', [])
        t_name_en = t_names_en[0] if isinstance(t_names_en, list) and len(t_names_en) > 0 else str(t_names_en)

        # 3. 컬럼명 (한글)
        c_names = [c[1] for c in t.get('column_names', [])]
        c_text = ", ".join(c_names)

        # ★ 결과: "테이블: 서울시 약국 정보 (TB_PHARM...) | 컬럼: ..."
        return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

    def _create_samples(self):
        table_ids = list(self.tables.keys())
        for item in tqdm(self.data, desc="Building Dataset"):
            q = item['question']
            gold_id = item['db_id']

            if gold_id in self.tables:
                self._add_sample(q, gold_id, 1)
                negs = random.sample([t for t in table_ids if t != gold_id], self.neg_ratio)
                for nid in negs:
                    self._add_sample(q, nid, 0)

    def _add_sample(self, q, tid, label):
        t_info = self.tables[tid]
        schema_text = self._make_schema_text(t_info)

        tokenized = self.tokenizer(
            q, schema_text,
            truncation=True, max_length=512, padding=False
        )
        self.samples.append({
            'input_ids': tokenized['input_ids'],
            'token_type_ids': tokenized['token_type_ids'],
            'attention_mask': tokenized['attention_mask'],
            'label': label
        })

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return self.samples[idx]

# =============================================================================
# [3] 학습 루프
# =============================================================================
def collate_fn(batch):
    tokenizer.pad_token_id = 0
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for b in batch:
        pad_len = max_len - len(b['input_ids'])
        input_ids.append(b['input_ids'] + [0]*pad_len)
        token_type_ids.append(b['token_type_ids'] + [0]*pad_len)
        attention_mask.append(b['attention_mask'] + [0]*pad_len)
        labels.append(b['label'])
    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels)
    }

if __name__ == '__main__':
    os.makedirs(SAVE_DIR, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"🔥 Reranker 긴급 수정 재학습 (v4) 시작")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    dataset = KoreanSchemaDataset(TRAIN_PATH, TABLE_PATH, tokenizer, neg_ratio=NEG_RATIO)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        loop = tqdm(loader, desc=f"Epoch {epoch+1}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            loop.set_postfix(loss=f"{loss.item():.4f}")

        # 저장
        model.save_pretrained(f"{SAVE_DIR}/epoch_{epoch+1}")
        tokenizer.save_pretrained(f"{SAVE_DIR}/epoch_{epoch+1}")
        print(f"✨ Epoch {epoch+1} 저장 완료")

🔥 Reranker 긴급 수정 재학습 (v4) 시작


Building Dataset:   0%|          | 0/88946 [00:00<?, ?it/s]

📌 학습 데이터 생성 완료: 355784건


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1:   0%|          | 0/22237 [00:00<?, ?it/s]

  with autocast():


✨ Epoch 1 저장 완료


Epoch 2:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 2 저장 완료


Epoch 3:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 3 저장 완료


In [None]:
import json
import torch
import os
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 설정 & 경로
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"

# 1. Retriever: 69% 성능의 v2 (변경 없음)
RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"

# 2. Reranker: ★ 방금 학습 마친 v4 (매핑 수정버전) ★
RERANKER_PATH = f"{WORK_DIR}/saved_models_final/reranker_fixed_v4/epoch_3"

VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
TEST_LIMIT = 500
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🔥 운명의 최종 평가 시작 (Device: {device})")
print(f"👉 Model: {RERANKER_PATH}")

# =============================================================================
# [2] 스키마 텍스트 생성 (★ 학습 코드와 100% 일치시킴 ★)
# =============================================================================
def make_schema_text(t):
    # 🚨 [중요] 학습 코드와 동일하게 '한글'을 메인으로 배치

    # 1. 한글 테이블명 (table_names)
    t_names = t.get('table_names', [])
    t_name_ko = t_names[0] if isinstance(t_names, list) and len(t_names) > 0 else str(t_names)

    # 2. 영문 테이블명 (table_names_original)
    t_names_en = t.get('table_names_original', [])
    t_name_en = t_names_en[0] if isinstance(t_names_en, list) and len(t_names_en) > 0 else str(t_names_en)

    # 3. 컬럼명
    c_names = [c[1] for c in t.get('column_names', [])]
    c_text = ", ".join(c_names)

    # 결과: "테이블: 서울시 약국 (TB_PHARM...) | 컬럼: ..."
    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

# BM25용 토크나이저
def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

# =============================================================================
# [3] 모델 로드
# =============================================================================
print("📂 모델 로딩 중...")
vector_model = SentenceTransformer(RETRIEVER_PATH)
tokenizer = AutoTokenizer.from_pretrained(RERANKER_PATH)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_PATH).to(device).eval()

with open(TABLE_PATH, 'r', encoding='utf-8') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}

with open(VALID_PATH, 'r', encoding='utf-8') as f:
    valid_data = json.load(f)[:TEST_LIMIT]

# =============================================================================
# [4] 인덱싱
# =============================================================================
print("📂 인덱싱 구축 중...")
bm25_corpus = []
vector_corpus = []
db_ids = []

for t in tqdm(table_list, desc="Indexing"):
    # Reranker/Vector용 (Rich Text)
    rich_text = make_schema_text(t)
    vector_corpus.append(rich_text)

    # BM25용 (한글 이름 강조)
    t_names = t.get('table_names', [])
    t_name_ko = t_names[0] if isinstance(t_names, list) and len(t_names) > 0 else str(t_names)
    c_names = [c[1] for c in t.get('column_names', [])]

    bm25_text = f"{t_name_ko} {t_name_ko} {' '.join(c_names)}"
    bm25_corpus.append(bigram_tokenizer(bm25_text))

    db_ids.append(t['db_id'])

bm25 = BM25Okapi(bm25_corpus)
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True, show_progress_bar=True).to('cpu')

# =============================================================================
# [5] 평가 루프
# =============================================================================
stats = {"total": 0, "recall_union": 0, "top1": 0, "top5": 0}

for item in tqdm(valid_data, desc="Evaluating"):
    question = item['question']
    gold_id = item['db_id']
    stats["total"] += 1

    # --- Step 1: Candidate Generation ---
    candidates = set()

    # BM25
    q_tokens = bigram_tokenizer(question)
    if q_tokens:
        candidates.update(bm25.get_top_n(q_tokens, db_ids, n=50))

    # Vector
    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    top_vec = torch.topk(cos_scores, k=50)
    candidates.update([db_ids[i] for i in top_vec.indices.tolist()])

    candidate_list = list(candidates)

    if gold_id in candidate_list:
        stats["recall_union"] += 1
    else:
        continue

    # --- Step 2: Reranking ---
    rerank_scores = []
    with torch.no_grad():
        for tid in candidate_list:
            t_info = all_tables[tid]
            # ★ 수정된 매핑 함수 적용 ★
            schema_input = make_schema_text(t_info)

            inputs = tokenizer(question, schema_input, return_tensors="pt", truncation=True, max_length=512).to(device)
            score = reranker_model(**inputs).logits[0][1].item()
            rerank_scores.append((tid, score))

    rerank_scores.sort(key=lambda x: x[1], reverse=True)
    ranked_ids = [s[0] for s in rerank_scores]

    if gold_id == ranked_ids[0]: stats["top1"] += 1
    if gold_id in ranked_ids[:5]: stats["top5"] += 1

# =============================================================================
# [6] 결과 출력
# =============================================================================
t = stats["total"]
print("\n" + "="*60)
print(f"🏆 [최종 성적표] 매핑 수정(v4) 적용 결과")
print("-" * 60)
print(f"✅ Recall (Retrieval):     {stats['recall_union']/t*100:.2f}%")
print("-" * 60)
print(f"🥇 Top-1 Accuracy:        {stats['top1']/t*100:.2f}%")
print(f"🖐️ Top-5 Accuracy:        {stats['top5']/t*100:.2f}%")
print("="*60)

🔥 운명의 최종 평가 시작 (Device: cuda)
👉 Model: /content/drive/MyDrive/P02_SemanticParsing/saved_models_final/reranker_fixed_v4/epoch_3
📂 모델 로딩 중...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


📂 인덱싱 구축 중...


Indexing:   0%|          | 0/5761 [00:00<?, ?it/s]

Batches:   0%|          | 0/181 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/500 [00:00<?, ?it/s]


🏆 [최종 성적표] 매핑 수정(v4) 적용 결과
------------------------------------------------------------
✅ Recall (Retrieval):     67.00%
------------------------------------------------------------
🥇 Top-1 Accuracy:        17.60%
🖐️ Top-5 Accuracy:        27.40%


In [None]:
import os
import json
import torch
import random
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from torch.cuda.amp import autocast, GradScaler

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
SAVE_DIR = f"{WORK_DIR}/saved_models_final/reranker_v5_final" # 최종 버전

# 채굴용 Retriever (69% 성능의 v2 사용)
RETRIEVER_PATH = f"{WORK_DIR}/saved_models_final/retriever_finetuned_v2"
# 학습할 Base Reranker
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5

# =============================================================================
# [2] 스키마 텍스트 생성 (★ v4에서 검증된 올바른 매핑 ★)
# =============================================================================
def make_schema_text(t):
    # 한글 우선 (Korean First)
    t_names = t.get('table_names', [])
    t_name_ko = t_names[0] if isinstance(t_names, list) and len(t_names) > 0 else str(t_names)

    t_names_en = t.get('table_names_original', [])
    t_name_en = t_names_en[0] if isinstance(t_names_en, list) and len(t_names_en) > 0 else str(t_names_en)

    c_names = [c[1] for c in t.get('column_names', [])]
    c_text = ", ".join(c_names)

    return f"테이블: {t_name_ko} ({t_name_en}) | 컬럼: {c_text}"

# =============================================================================
# [3] Hard Negative 채굴 (Mining)
# =============================================================================
print("⛏️ Hard Negative 채굴 시작 (올바른 매핑 적용)...")

# 1. 테이블 로드 및 임베딩
with open(TABLE_PATH, 'r') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}
    all_db_ids = [t['db_id'] for t in table_list]
    # 여기서도 올바른 매핑 사용!
    all_schemas = [make_schema_text(t) for t in table_list]

vector_model = SentenceTransformer(RETRIEVER_PATH)
# 전체 테이블 임베딩 (시간 좀 걸림)
corpus_embeddings = vector_model.encode(all_schemas, convert_to_tensor=True, show_progress_bar=True)

# 2. Mining Loop
mined_data = []
with open(TRAIN_PATH, 'r') as f:
    train_data = json.load(f)

for item in tqdm(train_data, desc="Mining Hard Negatives"):
    question = item['question']
    gold_id = item['db_id']

    # 질문과 유사한 테이블 검색
    q_emb = vector_model.encode(question, convert_to_tensor=True)
    cos_scores = util.cos_sim(q_emb, corpus_embeddings)[0]

    # Top-10개를 가져와서 정답이 아닌 것들을 'Hard Negative'로 선정
    top_results = torch.topk(cos_scores, k=10)
    indices = top_results.indices.tolist()

    hard_negs = []
    for idx in indices:
        cand_id = all_db_ids[idx]
        if cand_id != gold_id:
            hard_negs.append(cand_id)

    # 데이터셋에 추가 (오답 중 상위 3개만 사용)
    if gold_id in all_tables and len(hard_negs) > 0:
        mined_data.append({
            "q": question,
            "pos": gold_id,
            "negs": hard_negs[:3]
        })

print(f"✅ 채굴 완료: {len(mined_data)}건 (고난이도 문제집)")

# =============================================================================
# [4] Reranker 학습 데이터셋 구성
# =============================================================================
class HardNegativeDataset(Dataset):
    def __init__(self, mined_data, tables, tokenizer):
        self.tokenizer = tokenizer
        self.samples = []

        for item in tqdm(mined_data, desc="Tokenizing"):
            q = item['q']
            pos_id = item['pos']
            neg_ids = item['negs']

            # Positive (정답)
            pos_schema = make_schema_text(tables[pos_id])
            tokenized_pos = tokenizer(q, pos_schema, truncation=True, max_length=512)
            self.samples.append({
                'input_ids': tokenized_pos['input_ids'],
                'token_type_ids': tokenized_pos['token_type_ids'],
                'attention_mask': tokenized_pos['attention_mask'],
                'label': 1
            })

            # Negatives (오답)
            for nid in neg_ids:
                neg_schema = make_schema_text(tables[nid])
                tokenized_neg = tokenizer(q, neg_schema, truncation=True, max_length=512)
                self.samples.append({
                    'input_ids': tokenized_neg['input_ids'],
                    'token_type_ids': tokenized_neg['token_type_ids'], # KoELECTRA 필수
                    'attention_mask': tokenized_neg['attention_mask'],
                    'label': 0
                })

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx): return self.samples[idx]

# =============================================================================
# [5] 학습 실행
# =============================================================================
def collate_fn(batch):
    max_len = max([len(b['input_ids']) for b in batch])
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for b in batch:
        pad_len = max_len - len(b['input_ids'])
        input_ids.append(b['input_ids'] + [0]*pad_len)
        token_type_ids.append(b['token_type_ids'] + [0]*pad_len)
        attention_mask.append(b['attention_mask'] + [0]*pad_len)
        labels.append(b['label'])
    return {
        'input_ids': torch.tensor(input_ids),
        'token_type_ids': torch.tensor(token_type_ids),
        'attention_mask': torch.tensor(attention_mask),
        'labels': torch.tensor(labels)
    }

if __name__ == '__main__':
    os.makedirs(SAVE_DIR, exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"🔥 Reranker v5 (Hard Negative + Fixed Mapping) 학습 시작")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    dataset = HardNegativeDataset(mined_data, all_tables, tokenizer)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()
    scaler = GradScaler()

    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        loop = tqdm(loader, desc=f"Epoch {epoch+1}")
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            loop.set_postfix(loss=f"{loss.item():.4f}")

        model.save_pretrained(f"{SAVE_DIR}/epoch_{epoch+1}")
        tokenizer.save_pretrained(f"{SAVE_DIR}/epoch_{epoch+1}")
        print(f"✨ Epoch {epoch+1} 저장 완료")

⛏️ Hard Negative 채굴 시작 (올바른 매핑 적용)...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


Batches:   0%|          | 0/181 [00:00<?, ?it/s]

Mining Hard Negatives:   0%|          | 0/88946 [00:00<?, ?it/s]

✅ 채굴 완료: 88946건 (고난이도 문제집)
🔥 Reranker v5 (Hard Negative + Fixed Mapping) 학습 시작


Tokenizing:   0%|          | 0/88946 [00:00<?, ?it/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()


Epoch 1:   0%|          | 0/22237 [00:00<?, ?it/s]

  with autocast():


✨ Epoch 1 저장 완료


Epoch 2:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 2 저장 완료


Epoch 3:   0%|          | 0/22237 [00:00<?, ?it/s]

✨ Epoch 3 저장 완료


In [None]:
!pip install rank_bm25
import json
import torch
import os
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
SAVE_DIR = f"{WORK_DIR}/saved_models_final"

# 비교할 체크포인트들
CHECKPOINTS = [
    ("Epoch_1", f"{SAVE_DIR}/reranker_v5_final/epoch_1"),
    ("Epoch_2", f"{SAVE_DIR}/reranker_v5_final/epoch_2"),
    ("Epoch_3", f"{SAVE_DIR}/reranker_v5_final/epoch_3") # 얘는 아마 점수 낮을 듯
]

RETRIEVER_PATH = f"{SAVE_DIR}/retriever_finetuned_v2"
VALID_PATH = f"{BASE_DIR}/valid.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"🔥 Epoch별 생존자 확인 (Device: {device})")

# =============================================================================
# [2] 준비 (Retriever & Data)
# =============================================================================
def make_schema_text(t):
    # v5 학습 포맷 (한글 우선)
    t_names = t.get('table_names', [
        ""
    ])[0]
    t_names_en = t.get('table_names_original', [
        ""
    ])[0]
    c_names = ", ".join([c[1] for c in t.get('column_names', [])])
    return f"테이블: {t_names} ({t_names_en}) | 컬럼: {c_names}"

def bigram_tokenizer(text):
    text = str(text).replace(" ", "")
    return [text[i:i+2] for i in range(len(text)-1)]

print("📂 데이터 준비 중...")
with open(TABLE_PATH, 'r') as f:
    table_list = json.load(f)
    all_tables = {t['db_id']: t for t in table_list}
with open(VALID_PATH, 'r') as f:
    valid_data = json.load(f)[:300] # 시간 절약을 위해 300개만 테스트

# Retriever 실행 (후보군 고정)
vector_model = SentenceTransformer(RETRIEVER_PATH)
vector_corpus = [make_schema_text(t) for t in table_list]
corpus_embeddings = vector_model.encode(vector_corpus, convert_to_tensor=True).to('cpu')

bm25_corpus = [bigram_tokenizer(t) for t in vector_corpus]
bm25 = BM25Okapi(bm25_corpus)
db_ids = [t['db_id'] for t in table_list]

prepared_data = []
print("⚡ 후보군 추출 중...")
for item in tqdm(valid_data):
    question = item['question']
    gold_id = item['db_id']

    candidates = set()
    q_tokens = bigram_tokenizer(question)
    if q_tokens: candidates.update(bm25.get_top_n(q_tokens, db_ids, n=50))

    q_emb = vector_model.encode(question, convert_to_tensor=True).to('cpu')
    top_vec = torch.topk(util.cos_sim(q_emb, corpus_embeddings)[0], k=50)
    candidates.update([db_ids[i] for i in top_vec.indices.tolist()])

    if gold_id in candidates:
        prepared_data.append({"q": question, "gold": gold_id, "cands": list(candidates)})

del vector_model, corpus_embeddings
torch.cuda.empty_cache()

# =============================================================================
# [3] Epoch별 평가 루프
# =============================================================================
results = []

for name, path in CHECKPOINTS:
    print(f"\n🚀 Evaluating [{name}]...")
    if not os.path.exists(path):
        print(f"⚠️ 저장된 모델 없음: {path}")
        continue

    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path).to(device).eval()

    top1 = 0
    top5 = 0
    total = len(prepared_data)

    for item in tqdm(prepared_data):
        question = item['q']
        cands = item['cands']

        scores = []
        with torch.no_grad():
            for tid in cands:
                schema = make_schema_text(all_tables[tid])
                inputs = tokenizer(question, schema, return_tensors="pt", truncation=True, max_length=512).to(device)
                logit = model(**inputs).logits[0][1].item()
                scores.append((tid, logit))

        scores.sort(key=lambda x: x[1], reverse=True)
        ranked = [s[0] for s in scores]

        if item['gold'] == ranked[0]: top1 += 1
        if item['gold'] in ranked[:5]: top5 += 1

    results.append({
        "Checkpoint": name,
        "Top-1 Acc": f"{top1/total*100:.2f}%",
        "Top-5 Acc": f"{top5/total*100:.2f}%"
    })

    del model, tokenizer
    torch.cuda.empty_cache()

print("\n🏆 Epoch 생존 확인 결과")
df = pd.DataFrame(results)
print(df.to_markdown(index=False))

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
🔥 Epoch별 생존자 확인 (Device: cuda)
📂 데이터 준비 중...


The tokenizer you are loading from '/content/drive/MyDrive/P02_SemanticParsing/saved_models_final/retriever_finetuned_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


⚡ 후보군 추출 중...


  0%|          | 0/300 [00:00<?, ?it/s]


🚀 Evaluating [Epoch_1]...


  0%|          | 0/199 [00:00<?, ?it/s]


🚀 Evaluating [Epoch_2]...


  0%|          | 0/199 [00:00<?, ?it/s]


🚀 Evaluating [Epoch_3]...


  0%|          | 0/199 [00:00<?, ?it/s]


🏆 Epoch 생존 확인 결과
| Checkpoint   | Top-1 Acc   | Top-5 Acc   |
|:-------------|:------------|:------------|
| Epoch_1      | 34.67%      | 43.72%      |
| Epoch_2      | 0.50%       | 4.52%       |
| Epoch_3      | 0.00%       | 0.50%       |


In [None]:
# 파일명: 18_train_schema_linking_fix.ipynb
import os
import json
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
from torch.cuda.amp import autocast, GradScaler

# =============================================================================
# [1] 설정
# =============================================================================
WORK_DIR = "/content/drive/MyDrive/P02_SemanticParsing"
BASE_DIR = f"{WORK_DIR}/nia"
TRAIN_PATH = f"{BASE_DIR}/train.json"
TABLE_PATH = f"{BASE_DIR}/tables.json"
SAVE_DIR = f"{WORK_DIR}/saved_models_final/schema_linker_e5"

MODEL_NAME = "upskyy/e5-base-korean"
BATCH_SIZE = 32
EPOCHS = 3
LR = 2e-5
MAX_LEN = 128
TEMPERATURE = 0.05

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔥 Schema Linking 학습 재시작 (Fixed) (Device: {device})")

# =============================================================================
# [2] 데이터 전처리 (버그 수정됨 ★)
# =============================================================================
def load_data(train_path, table_path):
    with open(train_path, 'r') as f:
        train_data = json.load(f)
    with open(table_path, 'r') as f:
        table_list = json.load(f)
        tables = {t['db_id']: t for t in table_list}
    return train_data, tables

def extract_gold_columns(sql_dict):
    """
    Spider SQL 포맷의 중첩 구조를 안전하게 파싱하여 사용된 컬럼 인덱스를 추출합니다.
    """
    used_cols = set()

    # Helper: val_unit에서 컬럼 추출 ([unit_op, col_unit1, col_unit2])
    def get_col_from_val_unit(val_unit):
        if not val_unit: return
        # col_unit1: [agg, col_id, is_distinct]
        col_unit1 = val_unit[1]
        if col_unit1: used_cols.add(col_unit1[1])
        col_unit2 = val_unit[2]
        if col_unit2: used_cols.add(col_unit2[1])

    # 1. SELECT 절: [is_distinct, [[agg, val_unit], ...]]
    if 'select' in sql_dict:
        select_info = sql_dict['select']
        # index 0은 bool(distinct), index 1이 실제 리스트
        if len(select_info) > 1 and isinstance(select_info[1], list):
            for agg, val_unit in select_info[1]:
                get_col_from_val_unit(val_unit)

    # 2. WHERE 절: [[not, op, val_unit, val1, val2], ...] (조건들의 리스트)
    if 'where' in sql_dict:
        for cond in sql_dict['where']:
            # 조건절은 보통 리스트 형태이며 길이가 3 이상 (AND/OR 등 연결어 제외)
            if isinstance(cond, list) and len(cond) > 2:
                # index 2가 val_unit
                get_col_from_val_unit(cond[2])

    # 3. GROUP BY: [[col_unit...], ...]
    if 'groupBy' in sql_dict and sql_dict['groupBy']:
        for col_unit in sql_dict['groupBy']:
            if col_unit: used_cols.add(col_unit[1])

    # 4. ORDER BY: [order, [val_unit, ...]]
    if 'orderBy' in sql_dict and sql_dict['orderBy']:
        order_info = sql_dict['orderBy']
        if len(order_info) > 1 and isinstance(order_info[1], list):
            for val_unit in order_info[1]:
                get_col_from_val_unit(val_unit)

    # 5. HAVING
    if 'having' in sql_dict and sql_dict['having']:
        for cond in sql_dict['having']:
            if isinstance(cond, list) and len(cond) > 2:
                get_col_from_val_unit(cond[2])

    # 0번 인덱스(*) 제외
    if 0 in used_cols: used_cols.remove(0)

    return list(used_cols)

# =============================================================================
# [3] 데이터셋
# =============================================================================
class SchemaLinkingDataset(Dataset):
    def __init__(self, train_data, tables, tokenizer):
        self.tokenizer = tokenizer
        self.pairs = []

        print("📂 학습 데이터 쌍 생성 중...")
        for item in tqdm(train_data):
            db_id = item['db_id']
            question = item['question']

            if db_id not in tables: continue
            table_info = tables[db_id]

            # 버그 수정된 함수로 정답 컬럼 추출
            gold_col_indices = extract_gold_columns(item['sql'])

            if not gold_col_indices: continue # 정답 컬럼이 없으면 스킵 (*만 있는 경우 등)

            for idx in gold_col_indices:
                # 테이블/컬럼 정보 안전하게 가져오기
                t_names = table_info.get('table_names_original', ["UNKNOWN"])
                t_name_en = t_names[0] if isinstance(t_names, list) else str(t_names)

                c_names_en = table_info['column_names_original']
                c_names_ko = table_info['column_names']

                # 인덱스 범위 체크 (데이터셋 오류 방지)
                if idx >= len(c_names_en): continue

                c_name_en_str = c_names_en[idx][1]
                c_name_ko_str = c_names_ko[idx][1]

                # "query: ..." / "passage: TB_NAME.COL_NAME(한글명)"
                q_text = f"query: {question}"
                p_text = f"passage: {t_name_en}.{c_name_en_str}({c_name_ko_str})"

                self.pairs.append((q_text, p_text))

        print(f"✅ 총 {len(self.pairs)}개의 (질문, 정답컬럼) 데이터 생성 완료!")

    def __len__(self): return len(self.pairs)
    def __getitem__(self, idx): return self.pairs[idx]

def collate_fn(batch):
    queries = [b[0] for b in batch]
    passages = [b[1] for b in batch]
    tok_q = tokenizer(queries, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    tok_p = tokenizer(passages, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt")
    return tok_q, tok_p

# =============================================================================
# [4] 모델 & 학습
# =============================================================================
class E5Model(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
    def forward(self, input_ids, attention_mask):
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
        sum_embeddings = torch.sum(last_hidden * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        embeddings = sum_embeddings / sum_mask
        return torch.nn.functional.normalize(embeddings, p=2, dim=1)

if __name__ == '__main__':
    train_data, tables = load_data(TRAIN_PATH, TABLE_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # 데이터셋 생성 (수정된 로직 적용)
    dataset = SchemaLinkingDataset(train_data, tables, tokenizer)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

    model = E5Model(MODEL_NAME).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    loss_fn = nn.CrossEntropyLoss()
    scaler = GradScaler()

    print("\n🚀 학습 시작...")
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        loop = tqdm(loader, desc=f"Epoch {epoch+1}")

        for tok_q, tok_p in loop:
            tok_q = {k: v.to(device) for k, v in tok_q.items()}
            tok_p = {k: v.to(device) for k, v in tok_p.items()}

            optimizer.zero_grad()
            with autocast():
                q_emb = model(tok_q['input_ids'], tok_q['attention_mask'])
                p_emb = model(tok_p['input_ids'], tok_p['attention_mask'])
                sim_scores = torch.matmul(q_emb, p_emb.T) / TEMPERATURE
                labels = torch.arange(sim_scores.size(0)).to(device)
                loss = loss_fn(sim_scores, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
            loop.set_postfix(loss=f"{loss.item():.4f}")

        save_path = f"{SAVE_DIR}/epoch_{epoch+1}"
        os.makedirs(save_path, exist_ok=True)
        model.backbone.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        print(f"✨ Epoch {epoch+1} 저장 완료")

    print("🎉 학습 완료! 이제 Top-5 컬럼 검색이 가능합니다.")

🔥 Schema Linking 학습 재시작 (Fixed) (Device: cuda)
📂 학습 데이터 쌍 생성 중...


  0%|          | 0/88946 [00:00<?, ?it/s]

✅ 총 220491개의 (질문, 정답컬럼) 데이터 생성 완료!


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]


🚀 학습 시작...


  scaler = GradScaler()


Epoch 1:   0%|          | 0/6891 [00:00<?, ?it/s]

  with autocast():


✨ Epoch 1 저장 완료


Epoch 2:   0%|          | 0/6891 [00:00<?, ?it/s]

✨ Epoch 2 저장 완료


Epoch 3:   0%|          | 0/6891 [00:00<?, ?it/s]

✨ Epoch 3 저장 완료
🎉 학습 완료! 이제 Top-5 컬럼 검색이 가능합니다.
