# Team7 Assignment 2: Kaggle Inference

### Setup and Dataset Loading (C2-Only Version)

In [1]:
import os, numpy as np, pandas as pd
import torch
import joblib, time
from sentence_transformers import SentenceTransformer
from scipy.special import softmax

# -------------------------------------------------------------------------
# Kaggle dataset locations
# -------------------------------------------------------------------------
KAGGLE_MODEL_DIR = "/kaggle/input/models/models"
KAGGLE_DATA_DIR = "/kaggle/input/llm-classification-finetuning"
BASE_DIR = KAGGLE_MODEL_DIR  # alias of the model directory

# Seed value and compute device used by the rest of the notebook.
random_state = 20010815
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Inference only needs the test set and the sample submission.
test = pd.read_csv(os.path.join(KAGGLE_DATA_DIR, "test.csv"))
sample = pd.read_csv(os.path.join(KAGGLE_DATA_DIR, "sample_submission.csv"))

print("DATA:", KAGGLE_DATA_DIR, test.shape)

2025-11-06 10:49:30.201743: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762426170.402188      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762426170.454973      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


DATA: /kaggle/input/llm-classification-finetuning (3, 4)


### Global Functions (C2-Required Only)

In [2]:
import time
import random

# Seed NumPy, Python's `random` module and PyTorch with the same value so
# repeated runs are reproducible.
for seed_rng in (np.random.seed, random.seed, torch.manual_seed):
    seed_rng(random_state)
del seed_rng  # do not leave the loop variable behind in the notebook namespace
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

from sentence_transformers import SentenceTransformer
def load_model(candidates, idx=0, device="cpu"):
    """Load the SentenceTransformer located at ``candidates[idx]``.

    Only the single entry selected by ``idx`` is attempted; the remaining
    entries of ``candidates`` are not used as fallbacks.

    Args:
        candidates: Indexable collection of model paths or names.
        idx: Position in ``candidates`` of the model to load.
        device: Device string passed to ``SentenceTransformer``.

    Returns:
        A ``(model, path)`` tuple: the loaded model and the path it was
        loaded from.

    Raises:
        RuntimeError: If constructing the model fails. The underlying
            exception is attached as ``__cause__`` so its traceback is kept.
    """
    path = candidates[idx]
    print("try:", path)
    try:
        model = SentenceTransformer(path, device=device)
    except Exception as e:
        # Chain the original error; raising outside the handler (as before)
        # discarded the real traceback and made failures hard to diagnose.
        raise RuntimeError("Failed to load model. " + str(e)) from e
    print("loaded model from:", path)
    return model, path

def build_feat(P, A, B):
    """Stack three arrays with their pairwise interaction terms.

    In this notebook the arguments are the prompt, response-A and response-B
    embedding matrices.

    The horizontally stacked blocks are, in order:
    ``P, A, B, A - B, |A - B|, A * B, P * A, P * B``.

    Args:
        P, A, B: Arrays that support element-wise arithmetic with each other.

    Returns:
        The ``np.hstack`` of the eight blocks listed above.
    """
    delta = A - B
    blocks = [
        P,
        A,
        B,
        delta,
        np.abs(delta),
        A * B,
        P * A,
        P * B,
    ]
    return np.hstack(blocks)

def l2norm(a, eps=1e-12):
    """Scale each row of ``a`` to unit L2 norm.

    Row norms are floored at ``eps`` before dividing, so all-zero rows come
    back as zeros instead of producing a division by zero.

    Args:
        a: 2-D array; norms are taken along axis 1.
        eps: Lower bound applied to every row norm.

    Returns:
        Array of the same shape as ``a`` with each row divided by its
        (floored) norm.
    """
    row_norms = np.clip(np.linalg.norm(a, axis=1, keepdims=True), eps, None)
    return a / row_norms

def encode_texts(model, texts, batch_size=256):
    """Encode texts batch by batch and return L2-normalized embeddings.

    Args:
        model: Object exposing ``encode(...)`` with the keyword arguments used
            below (a SentenceTransformer in this notebook). For empty input,
            ``get_sentence_embedding_dimension()`` is consulted if present.
        texts: ``pd.Series`` or other sliceable sequence of texts.
        batch_size: Number of texts sent to ``model.encode`` per call.

    Returns:
        2-D array with one row per text, passed through ``l2norm``. Empty
        input yields an empty ``(0, dim)`` array instead of raising.
    """
    # Convert a Series once up front rather than on every batch slice.
    if isinstance(texts, pd.Series):
        texts = texts.tolist()

    total_texts = len(texts)
    if total_texts == 0:
        # np.vstack([]) raises "need at least one array to concatenate";
        # return an empty matrix with the model's embedding width instead.
        get_dim = getattr(model, "get_sentence_embedding_dimension", None)
        dim = (get_dim() if callable(get_dim) else 0) or 0
        # NOTE(review): float32 is assumed to match the encoder output dtype - confirm.
        return np.zeros((0, dim), dtype=np.float32)

    total_batches = (total_texts + batch_size - 1) // batch_size
    vecs = []
    for batch_num, start in enumerate(range(0, total_texts, batch_size), start=1):
        start_time = time.time()
        batch = texts[start:start + batch_size]
        # Normalization is done once at the end by l2norm, so it is disabled here.
        v = model.encode(batch, batch_size=len(batch), convert_to_numpy=True, normalize_embeddings=False, show_progress_bar=False)
        vecs.append(v)
        # '\r' keeps the progress counter on a single, continuously updated line.
        print(f"{batch_num}/{total_batches} | time: {time.time() - start_time:.2f}s", end='\r', flush=True)
    return l2norm(np.vstack(vecs))

def create_and_save_submission(predictions, filename, test_df, sample_df):
    """Normalize class scores, write the submission CSV and re-read it to verify.

    Args:
        predictions: 2-D array-like whose columns 0, 1 and 2 hold the scores
            for ``winner_model_a``, ``winner_model_b`` and ``winner_tie``.
        filename: Destination path of the CSV file.
        test_df: DataFrame providing the ``"id"`` column.
        sample_df: Sample submission; its columns define the output column
            order and are used for the post-write check.

    Returns:
        The submission DataFrame that was written to ``filename``.

    Notes:
        Each row is rescaled to sum to 1. Rows whose sum is non-finite or not
        above 1e-15 cannot be rescaled and are replaced with uniform 1/3
        probabilities; previously such rows were written out as invalid
        probabilities. Verification is best-effort: failures are printed,
        not raised.
    """
    prob_cols = ["winner_model_a", "winner_model_b", "winner_tie"]
    print(f"Creating submission file: {filename}...")

    # Explicit column indices keep the IndexError for inputs with < 3 columns.
    raw = np.asarray(predictions)[:, [0, 1, 2]]
    row_sums = raw.sum(axis=1, keepdims=True)
    unusable = ~np.isfinite(row_sums) | (row_sums <= 1e-15)
    # Divide unusable rows by 1 for now; they are overwritten just below.
    probs = raw / np.where(unusable, 1.0, row_sums)
    n_unusable = int(unusable.sum())
    if n_unusable:
        print(f"Warning: {n_unusable} row(s) had no usable probability mass; using uniform 1/3.")
        probs[unusable[:, 0]] = 1.0 / 3.0

    sub_df = pd.DataFrame({
        "id": test_df["id"],
        prob_cols[0]: probs[:, 0],
        prob_cols[1]: probs[:, 1],
        prob_cols[2]: probs[:, 2],
    })

    # Match the sample submission's column order when possible.
    try:
        sub_df = sub_df[sample_df.columns]
    except KeyError as e:
        print(f"Warning: Columns in sample_df not found. Error: {e}")
    sub_df.to_csv(filename, index=False)

    # Round-trip check of the file that was actually written.
    try:
        chk = pd.read_csv(filename)
        assert list(chk.columns) == list(sample_df.columns)
        assert not chk.isna().any().any()
        assert np.allclose(chk[prob_cols].sum(1).values, 1.0)
        print(f"Successfully saved and verified: {filename} (Shape: {sub_df.shape})")
    except Exception as e:
        print(f"Error: Submission file verification failed! {e}")
    return sub_df

# Status message only: this line performs no check of the definitions above.
print("All C2-related functions loaded.")

All C2-related functions loaded.


### Step 1. Generate C2 (XGBoost) Predictions

In [3]:
### C2: PLM + XGBoost (Calibrated) ###
MODEL_NAME_C2 = "e5-base-v2"
C2_MODEL_PATH = f"{BASE_DIR}/{MODEL_NAME_C2}"
GBM_CHOICE = "XGBOOST"
C2_CALIBRATED_MODEL_PATH = f"{BASE_DIR}/candidate_2_{GBM_CHOICE}_{MODEL_NAME_C2}_CALIBRATED.pkl"

print("--- Candidate 2: PLM + XGBoost ---")
final_test_pred = None

try:
    # 1. Load the embedding model and build the test feature matrix (X_test_c2).
    sbert_model, _ = load_model([C2_MODEL_PATH], idx=0, device=device)
    prompt_emb_te = encode_texts(sbert_model, test["prompt"])
    a_emb_te = encode_texts(sbert_model, test["response_a"])
    b_emb_te = encode_texts(sbert_model, test["response_b"])
    X_test_c2 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)
    # Drop the encoder and raw embeddings; only the feature matrix is needed from here on.
    del sbert_model, prompt_emb_te, a_emb_te, b_emb_te
    print("C2 test features (X_test_c2) generated.")

    # 2. Load the final calibrated model and predict class probabilities.
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code - only load artifacts from a trusted source.
    calibrated_final_c2 = joblib.load(C2_CALIBRATED_MODEL_PATH)
    final_test_pred = calibrated_final_c2.predict_proba(X_test_c2)
    print("C2 test predictions successfully generated and calibrated.")

except Exception as e:
    # Best-effort: keep going so a submission can still be written, but fall
    # back to uniform probabilities. The previous all-zero placeholder was not
    # a valid probability distribution (rows summed to 0).
    print(f"ERROR in C2 processing: {e}")
    print("Falling back to uniform 1/3 probabilities for every test row.")
    final_test_pred = np.full((len(test), 3), 1.0 / 3.0)

if device == 'cuda': torch.cuda.empty_cache()

--- Candidate 2: PLM + XGBoost ---
try: /kaggle/input/models/models/e5-base-v2




loaded model from: /kaggle/input/models/models/e5-base-v2
C2 test features (X_test_c2) generated.
C2 test predictions successfully generated and calibrated.


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### Step 2. Create Submission File

In [4]:
print("\n" + "=" * 80, "GENERATING FINAL SUBMISSION (C2-ONLY)", "=" * 80, sep="\n")

# Bail out unless the C2 prediction step left behind `final_test_pred` with
# one row per test example.
if 'final_test_pred' not in locals() or final_test_pred is None or final_test_pred.shape[0] != len(test):
    print("\nERROR: C2 predictions not found or shape mismatch.")
    print("Submission file not created.")
else:
    print(f"\nFinal predictions shape: {final_test_pred.shape}")
    print(f"Probability sum check (first 3): {final_test_pred[:3].sum(axis=1)}")

    final_filename = "submission.csv"
    create_and_save_submission(final_test_pred, final_filename, test, sample)

    print(
        "\n" + "=" * 80,
        "FINAL SUBMISSION CREATED",
        f"File: {final_filename}",
        "Method: C2 (XGBoost) ONLY",
        "=" * 80,
        sep="\n",
    )



GENERATING FINAL SUBMISSION (C2-ONLY)

Final predictions shape: (3, 3)
Probability sum check (first 3): [1. 1. 1.]
Creating submission file: submission.csv...
Successfully saved and verified: submission.csv (Shape: (3, 4))

FINAL SUBMISSION CREATED
File: submission.csv
Method: C2 (XGBoost) ONLY
