# Team7 Assiginment2

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
def check_gpus_pytorch():
    # PyTorch가 CUDA를 지원하는지 확인
    if torch.cuda.is_available():
        # 사용 가능한 GPU 개수 확인
        gpu_count = torch.cuda.device_count()
        print(f"✅ PyTorch에서 인식하는 총 GPU 개수: **{gpu_count}**개")

        # 각 GPU의 이름 확인
        for i in range(gpu_count):
            print(f"   - GPU {i}: {torch.cuda.get_device_name(i)}")
    else:
        print("❌ PyTorch가 CUDA(GPU)를 지원하지 않습니다.")
check_gpus_pytorch()


✅ PyTorch에서 인식하는 총 GPU 개수: **1**개
   - GPU 0: NVIDIA RTX A5000


### Setting and Dataset Load

In [2]:
import os, numpy as np, pandas as pd
import torch

LOCAL, KAGGLE = ".", "/kaggle/input/llm-classification-finetuning"
DATA = LOCAL if os.path.exists("./train.csv") else KAGGLE
train = pd.read_csv(f"{DATA}/train.csv")
test  = pd.read_csv(f"{DATA}/test.csv")
sample = pd.read_csv(f"{DATA}/sample_submission.csv")

need = {"prompt","response_a","response_b","winner_model_a","winner_model_b","winner_tie"}
assert need.issubset(set(train.columns)), f"column: {need - set(train.columns)} is missing in train.csv"
print("DATA:", DATA, train.shape, test.shape)

# target (y)
# 0: model_a win, 1: model_b win, 2: tie
y = train[["winner_model_a", "winner_model_b", "winner_tie"]].values.argmax(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
random_state = 20010815
val_size = 0.2

print(device)

DATA: . (57477, 9) (3, 4)
cuda


In [3]:

### Global Functions ###
import time
import random

# Set random seeds for reproducibility
np.random.seed(random_state)
random.seed(random_state)
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

from sentence_transformers import SentenceTransformer
def load_model(candidates, idx=0, device="cpu"):
    # Load
    last_err = None
    path = candidates[idx]
    try:
        print("try:", path)
        model = SentenceTransformer(path, device=device)
        print("loaded model from:", path)
        return model, path
    except Exception as e:
        last_err = e
    raise RuntimeError("Failed to load model. In Kaggle, you need to upload the model folder to Datasets and then link it via 'Add data'. Last error: " + str(last_err))


def build_feat(P, A, B):
    AB_diff = A - B
    AB_adiff = np.abs(AB_diff)
    AB_mul = A * B
    PA_mul = P * A
    PB_mul = P * B
    return np.hstack([P, A, B, AB_diff, AB_adiff, AB_mul, PA_mul, PB_mul])


def l2norm(a, eps=1e-12):
    n = np.linalg.norm(a, axis=1, keepdims=True)
    n = np.clip(n, eps, None)
    return a / n

def encode_texts(model, texts, batch_size=256):
    vecs = []
    total_texts = len(texts)
    total_batches = (total_texts + batch_size - 1) // batch_size

    for i in range(0, len(texts), batch_size):
        start_time = time.time()
        batch = texts[i:i+batch_size].tolist() if isinstance(texts, pd.Series) else texts[i:i+batch_size]
        v = model.encode(batch, batch_size=len(batch), convert_to_numpy=True, normalize_embeddings=False, show_progress_bar=False)
        vecs.append(v)

        batch_num = (i // batch_size) + 1
        print(f"{batch_num}/{total_batches} | time: {time.time() - start_time:.2f}s", end='\r', flush=True)
    V = np.vstack(vecs)
    return l2norm(V)


def create_and_save_submission(predictions, filename, test_df, sample_df):
    """
    Creates a Kaggle submission file from model predictions.
    Then, it normalizes the probabilities, performs validation checks, and saves the file.
    Args:
        predictions (np.array): return value of predict_proba() (N, 3)
        filename (str): csv filename to save the submission.
        test_df (pd.DataFrame): dataframe containing 'id' column.
        sample_df (pd.DataFrame): dataframe to align columns with sample submission.
    """
    print(f"Creating submission file: {filename}...")
    
    # 1. Save Submission File
    sub_df = pd.DataFrame({
        "id": test_df["id"],
        "winner_model_a": predictions[:, 0],
        "winner_model_b": predictions[:, 1],
        "winner_tie":     predictions[:, 2],
    })

    # 2. Normalization check (safety)
    probs = sub_df[["winner_model_a", "winner_model_b", "winner_tie"]].values
    row_sums = probs.sum(axis=1, keepdims=True)
    probs = probs / np.clip(row_sums, 1e-15, None)
    sub_df[["winner_model_a", "winner_model_b", "winner_tie"]] = probs

    # 3. Align columns with sample submission
    try:
        sub_df = sub_df[sample_df.columns]
    except KeyError as e:
        print(f"Warning: Columns in sample_df not found. Saving with default columns. Error: {e}")

    # 4. Save
    sub_df.to_csv(filename, index=False)

    # 5. Assertions to check file integrity
    try:
        chk = pd.read_csv(filename)
        
        assert list(chk.columns) == list(sample_df.columns), \
            f"Column mismatch. Expected: {list(sample_df.columns)}, Got: {list(chk.columns)}"
        
        assert not chk.isna().any().any(), "NaN values found in submission file."
        
        prob_cols = ["winner_model_a", "winner_model_b", "winner_tie"]
        assert np.allclose(chk[prob_cols].sum(1).values, 1.0), \
            "Probabilities do not sum to 1.0 for all rows."
            
        print(f"Successfully saved and verified: {filename} (Shape: {sub_df.shape})")
        
    except FileNotFoundError:
        print(f"Error: File not found after saving: {filename}")
    except AssertionError as e:
        print(f"Error: Submission file verification failed! {e}")
    
    return sub_df

print("All functions loaded.")


  from .autonotebook import tqdm as notebook_tqdm


All functions loaded.


### Step0. Equal Distribution
All possibility are same

In [None]:
sub = pd.DataFrame({
    "id": test["id"],
    "winner_model_a": np.full(len(test), 1/3),
    "winner_model_b": np.full(len(test), 1/3),
    "winner_tie":     np.full(len(test), 1/3),
})
# sample column order alignment
if set(sample.columns) == set(sub.columns):
    sub = sub[sample.columns]
sub.head()


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.333333,0.333333,0.333333
1,211333,0.333333,0.333333,0.333333
2,1233961,0.333333,0.333333,0.333333


## Step 1. Baseline Model
a simple baseline using lexical/length features and Logistic Regression.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import os

def simple_stats(s):
    """Calculates basic statistics for the simple baseline."""
    if not isinstance(s, str): s = ""
    toks = s.split()
    return {
        "len_char": len(s),
        "len_tok": len(toks),
        "num_sent": sum(s.count(x) for x in [".", "!", "?"]), # Incomplete sentence count
    }

def build_simple_features(df):
    """Builds minimal features for the Step 1 Simple Baseline."""
    rows = []
    # Use 'response_a' for train, 'response_A' for test if columns differ
    if "response_a" in df.columns:
        cols = ["prompt", "response_a", "response_b"]
    else:
        cols = ["prompt", "response_A", "response_B"]
        
    for p, a, b in zip(df[cols[0]], df[cols[1]], df[cols[2]]):
        ps = simple_stats(p)
        as_ = simple_stats(a)
        bs = simple_stats(b)
        
        rows.append({
            # Prompt features
            "p_len_char": ps["len_char"],
            
            # Response A features
            "a_len_char": as_["len_char"],
            "a_len_tok": as_["len_tok"],
            "a_num_sent": as_["num_sent"],
            
            # Response B features
            "b_len_char": bs["len_char"],
            "b_len_tok": bs["len_tok"],
            "b_num_sent": bs["num_sent"],
            
            # Difference features (A-B)
            "d_len_char": as_["len_char"] - bs["len_char"],
            "d_len_tok": as_["len_tok"] - bs["len_tok"],
            "d_num_sent": as_["num_sent"] - bs["num_sent"],
        })
    return pd.DataFrame(rows)

print("Building SIMPLE features for Step 1...")
X_train_simple = build_simple_features(train)
X_test_simple = build_simple_features(test)
print(f"X_train (simple) shape: {X_train_simple.shape}, X_test (simple) shape: {X_test_simple.shape}")

# Handle NaN/inf as 0
X_train_simple = X_train_simple.fillna(0).replace([np.inf, -np.inf], 0)
X_test_simple = X_test_simple.fillna(0).replace([np.inf, -np.inf], 0)

print("\n--- Validating Simple Baseline (Step 1) ---")

# Data split for validation
X_tr, X_va, y_tr, y_va = train_test_split(X_train_simple, y, test_size=val_size, random_state=random_state, stratify=y)

# Scaling for validation
scaler = StandardScaler()
X_tr_sc = scaler.fit_transform(X_tr)
X_va_sc = scaler.transform(X_va)

# Model Training and Evaluation
clf = LogisticRegression(max_iter=2000, multi_class="multinomial")
clf.fit(X_tr_sc, y_tr)
va_pred = clf.predict_proba(X_va_sc)

print(f"Validation LogLoss: {log_loss(y_va, va_pred):.6f}")

# (Optional) Check Feature Importance
print("\nFeature Coefficients (Class 0: winner_model_a):")
for name, coef in zip(X_train_simple.columns, clf.coef_[0]):
    print(f"{name:15s} {coef: .4f}")

Building SIMPLE features for Step 1...
X_train (simple) shape: (57477, 10), X_test (simple) shape: (3, 10)

--- Validating Simple Baseline (Step 1) ---
Validation LogLoss: 1.070572

Feature Coefficients (Class 0: winner_model_a):
p_len_char      -0.0140
a_len_char       0.0631
a_len_tok        0.0225
a_num_sent       0.0298
b_len_char      -0.0132
b_len_tok       -0.0271
b_num_sent      -0.0343
d_len_char       0.1060
d_len_tok        0.0688
d_num_sent       0.0797




In [23]:
print("\n--- Training Full Model for Step 1 Submission ---")

# Create a new scaler and fit on ALL training data
scaler_full = StandardScaler()
X_all_sc = scaler_full.fit_transform(X_train_simple) # Use cleaned X_train_simple
# Transform the test data using the full scaler
X_test_sc = scaler_full.transform(X_test_simple) # Use cleaned X_test_simple

# Train a new model on ALL training data (full 'y')
clf_full = LogisticRegression(max_iter=2000, multi_class="multinomial")
clf_full.fit(X_all_sc, y) 

print("Full model trained.")

# Predict on Test Data
pred_step1 = clf_full.predict_proba(X_test_sc)
print("Prediction on test set complete.")

create_and_save_submission(
    predictions=pred_step1, 
    filename="submission_step1_simple_baseline.csv",
    test_df=test,
    sample_df=sample
)


--- Training Full Model for Step 1 Submission ---
Full model trained.
Prediction on test set complete.
Creating submission file: submission_step1_simple_baseline.csv...
Successfully saved and verified: submission_step1_simple_baseline.csv (Shape: (3, 4))




Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.325984,0.339632,0.334384
1,211333,0.430757,0.251934,0.317309
2,1233961,0.365975,0.331413,0.302613


### Step 2. Embedding-based Model
- Use a pre-trained sentence embedding model
- Construct prompt+response embeddings and train a classifier.

In [9]:
from sentence_transformers import SentenceTransformer
import os

MODELS = {
    "all-MiniLM-L6-v2":  "sentence-transformers/all-MiniLM-L6-v2",
    "all-MiniLM-L12-v2": "sentence-transformers/all-MiniLM-L12-v2",
    "e5-small-v2":       "intfloat/e5-small-v2",
    "e5-base-v2":        "intfloat/e5-base-v2",
    "e5-large-v2":       "intfloat/e5-large-v2",
    "sentence-t5-base":  "sentence-transformers/sentence-t5-base",
    "sentence-t5-large": "sentence-transformers/sentence-t5-large",
}

BASE_DIR = "./models"
os.makedirs(BASE_DIR, exist_ok=True)

for name, hub_path in MODELS.items():
    save_path = os.path.join(BASE_DIR, name)
    if os.path.exists(save_path) and os.listdir(save_path):
        print(f"[skip] {name} already exists → {save_path}")
        continue
    print(f"[download] {name} from {hub_path}")
    try:
        model = SentenceTransformer(hub_path)
        model.save(save_path)
        print(f" -> saved to {save_path}")
    except Exception as e:
        print(f"[fail] {name}: {e}")

print("=== Model Download Complete (existing ones skipped) ===")


[skip] all-MiniLM-L6-v2 already exists → ./models/all-MiniLM-L6-v2
[skip] all-MiniLM-L12-v2 already exists → ./models/all-MiniLM-L12-v2
[skip] e5-small-v2 already exists → ./models/e5-small-v2
[skip] e5-base-v2 already exists → ./models/e5-base-v2
[skip] e5-large-v2 already exists → ./models/e5-large-v2
[skip] sentence-t5-base already exists → ./models/sentence-t5-base
[skip] sentence-t5-large already exists → ./models/sentence-t5-large
=== Model Download Complete (existing ones skipped) ===


In [None]:
from sentence_transformers import SentenceTransformer
import os, torch, glob

EXPECTED = [
    "all-MiniLM-L6-v2",
    "all-MiniLM-L12-v2",
    "e5-small-v2",
    "e5-base-v2",
    "e5-large-v2",
    "sentence-t5-base",
    "sentence-t5-large",
]

def list_existing(paths):
    return [p for p in paths if os.path.exists(p) and os.listdir(p)]

def set_model_candidates():

    if DATA == LOCAL:
        # 1) Local: ./models/<name>
        local_cands = [os.path.join("./models", n) for n in EXPECTED]
        candidates = list_existing(local_cands)
    else:
        # 2) Kaggle: /kaggle/input/<dataset>/[<bundle>/]<name>
        candidates = []
        if os.path.exists("/kaggle/input"):
            # Uploaded individually
            direct = [os.path.join("/kaggle/input", n) for n in EXPECTED]
            candidates += list_existing(direct)

            # Uploaded bundled
            bundles = glob.glob("/kaggle/input/*")
            for b in bundles:
                for n in EXPECTED:
                    candidates += list_existing([os.path.join(b, n)])
                    candidates += list_existing(glob.glob(os.path.join(b, "*", n)))

    return candidates

model_candidates = set_model_candidates()

print("=== Model Candidates ===")
for candidate in model_candidates:
    print("-", candidate)

=== Model Candidates ===
- ./models/all-MiniLM-L6-v2
- ./models/all-MiniLM-L12-v2
- ./models/e5-small-v2
- ./models/e5-base-v2
- ./models/e5-large-v2
- ./models/sentence-t5-base
- ./models/sentence-t5-large


In [None]:
import math, torch, time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss

model_names = []
times = []
mlp_val_losses = []
logistic_val_losses = []

for idx, model in enumerate(model_candidates):
    start_time = time.time()
    model, model_src = load_model(model_candidates, idx=idx, device=device)
    print("Load Complete. Using model from:", model_src)

    print("Encoding texts...")
    prompt_emb = encode_texts(model, train["prompt"])
    print("Prompt encoding complete.")
    a_emb = encode_texts(model, train["response_a"])
    print("Response A encoding complete.")
    b_emb = encode_texts(model, train["response_b"])
    print("Response B encoding complete.")

    prompt_emb_te = encode_texts(model, test["prompt"])
    a_emb_te = encode_texts(model, test["response_a"])
    b_emb_te = encode_texts(model, test["response_b"])
    print("Test set encoding complete.")

    # prompt_emb.shape, a_emb.shape, b_emb.shape
    print("prompt_emb.shape: ", prompt_emb.shape, "a_emb.shape: ", a_emb.shape, "b_emb.shape: ", b_emb.shape)

    X = build_feat(prompt_emb, a_emb, b_emb)
    X_te = build_feat(prompt_emb_te, a_emb_te, b_emb_te)
    print("X.shape: ", X.shape, "X_te.shape: ", X_te.shape)

    X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=val_size, stratify=y, random_state=random_state)

    scaler = StandardScaler(with_mean=True, with_std=True)
    X_tr_sc = scaler.fit_transform(X_tr)
    X_va_sc = scaler.transform(X_va)

    ### Logistic Regression ###
    clf = LogisticRegression(max_iter=2000, random_state=random_state) 
    clf.fit(X_tr_sc, y_tr)
    va_pred_logistic = clf.predict_proba(X_va_sc)

    ### MLP Classifier ###
    mlp = MLPClassifier(
        hidden_layer_sizes=(512, 256),
        activation="relu",
        solver="adam",
        alpha=1e-4,          # L2
        batch_size=512,
        learning_rate_init=1e-3,
        max_iter=100,        
        early_stopping=True,
        n_iter_no_change=5,
        random_state=random_state,
        # verbose=False,
        verbose=True,
    )
    mlp.fit(X_tr_sc, y_tr)
    va_pred_mlp = mlp.predict_proba(X_va_sc)

    end_time = time.time()
    elapsed = end_time - start_time

    print("\nModel: ", model_src, " | time: ", elapsed, " seconds\n", 
          "MLP Valid LogLoss:", log_loss(y_va, va_pred_mlp), "\n", 
          "Logistic Valid LogLoss:", log_loss(y_va, va_pred_logistic), "\n", "")
    model_names.append(model_src)
    times.append(elapsed)
    mlp_val_losses.append(log_loss(y_va, va_pred_mlp))
    logistic_val_losses.append(log_loss(y_va, va_pred_logistic))

import pandas as pd
results = pd.DataFrame({
    "model": model_names,
    "time_sec": times,
    "mlp_val_logloss": mlp_val_losses,
    "logistic_val_logloss": logistic_val_losses,
})
print("\n=== Summary of Results ===")
print(results)


try: ./models/all-MiniLM-L6-v2
loaded model from: ./models/all-MiniLM-L6-v2
Load Complete. Using model from: ./models/all-MiniLM-L6-v2
prompt_emb.shape:  (57477, 384) a_emb.shape:  (57477, 384) b_emb.shape:  (57477, 384)
X.shape:  (57477, 3072) X_te.shape:  (3, 3072)
Iteration 1, loss = 1.13490196
Validation score: 0.426832
Iteration 2, loss = 0.82293057
Validation score: 0.397478
Iteration 3, loss = 0.54224561
Validation score: 0.407480
Iteration 4, loss = 0.27053691
Validation score: 0.398347
Iteration 5, loss = 0.12268374
Validation score: 0.412916
Iteration 6, loss = 0.06144031
Validation score: 0.413133
Iteration 7, loss = 0.03738650
Validation score: 0.420744
Validation score did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Model:  ./models/all-MiniLM-L6-v2  | time:  187.7004976272583  seconds
 MLP Valid LogLoss: 1.0716764008014537 
 Logistic Valid LogLoss: 1.0958757584014502 
 
try: ./models/all-MiniLM-L12-v2
loaded model from: ./models/all-MiniLM-L12-v

In [33]:
import time
import torch
import pandas as pd
import numpy as np
import os
import joblib # Using joblib as in Candidate 3
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

BEST_MODEL_NAME = "e5-small-v2"
BEST_MODEL_PATH = f"./models/{BEST_MODEL_NAME}"

# Use GPU for encoding
device = "cuda" if torch.cuda.is_available() else "cpu"

sbert_model, model_src = load_model([BEST_MODEL_PATH], idx=0, device=device)

# Encode ALL texts (Train + Test)
# (Assumes 'encode_texts' and 'build_feat' from cell 5399aff3 are in memory)
print("Encoding texts for FULL dataset...")
start_time = time.time()

# Encode FULL training data
prompt_emb_full = encode_texts(sbert_model, train["prompt"])
a_emb_full = encode_texts(sbert_model, train["response_a"])
b_emb_full = encode_texts(sbert_model, train["response_b"])

# Build FULL training features
# This is the 'X' for our final model
X_final_step2 = build_feat(prompt_emb_full, a_emb_full, b_emb_full)

# Encode test data
prompt_emb_te = encode_texts(sbert_model, test["prompt"])
a_emb_te = encode_texts(sbert_model, test["response_a"])
b_emb_te = encode_texts(sbert_model, test["response_b"])

# Build test features
# This is the 'X_te' for our final model
X_test_final_step2 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)

# Clean up memory
del sbert_model, prompt_emb_full, a_emb_full, b_emb_full, prompt_emb_te, a_emb_te, b_emb_te
if device == 'cuda':
    torch.cuda.empty_cache()
print(f"Full feature extraction complete. Time: {time.time() - start_time:.2f}s")
print(f"Full Train Features: {X_final_step2.shape}")
print(f"Test Features: {X_test_final_step2.shape}")

print("Fitting StandardScaler on full training data...")
scaler_final_step2 = StandardScaler()
X_all_sc = scaler_final_step2.fit_transform(X_final_step2)
X_te_sc = scaler_final_step2.transform(X_test_final_step2)

# --- Train MLPClassifier on ALL data ---
print("Training final MLPClassifier on ALL data...")
start_time = time.time()

# Use parameters from your template
mlp_full = MLPClassifier(
    hidden_layer_sizes=(512, 256),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    batch_size=512,
    learning_rate_init=1e-3,
    max_iter=100,        
    early_stopping=True,
    n_iter_no_change=7,  
    verbose=False,      
)

mlp_full.fit(X_all_sc, y)
print(f"Full training complete. Time: {time.time() - start_time:.2f}s")

# Predict on Test Data
pred = mlp_full.predict_proba(X_te_sc)
print("Prediction on test set complete.")

create_and_save_submission(
    predictions=pred, 
    filename=f"submission_step2_{BEST_MODEL_NAME}_mlp.csv",
    test_df=test,
    sample_df=sample
)

try: ./models/e5-small-v2


RuntimeError: Failed to load model. In Kaggle, you need to upload the model folder to Datasets and then link it via 'Add data'. Last error: name 'SentenceTransformer' is not defined

### Step 3. Model Extensions
- Explore bias-aware features (position bias, verbosity).
- Try lightweight fine-tuning (e.g., DeBERTa-small + LoRA).
- Experiment with calibration or ensembling.

### Candidate 1: DeBERTa + LoRA ###

In [14]:
### Candidate 1: DeBERTa + LoRA ###
import torch
import os
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig # For QLoRA (4-bit quantization)
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from scipy.special import softmax

# MODEL_NAME = "all-MiniLM-L6-v2" # Can not use
# MODEL_PATH = f"./models/{MODEL_NAME}"
MODEL_NAME = f"microsoft/deberta-v3-base"

max_length = 512 # Max length(tokens) for DeBERTa
# max_length = 1024 # Max length(tokens) for DeBERTa

LORA_ADAPTER_DIR = f"./models/lora_adapter_{MODEL_NAME.split('/')[-1]}"

print(f"Using model: {MODEL_NAME}")
print(f"LoRA adapter will be saved to: {LORA_ADAPTER_DIR}")

train_df_lora = train.copy()
train_df_lora['labels'] = y

train_df, val_df = train_test_split(
    train_df_lora,
    test_size=val_size,

    stratify=train_df_lora['labels']
)
print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

# Convert to Hugging Face 'Dataset' object
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer for our model
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    # This formats the input as: [CLS] prompt [SEP] A: response_a [SEP] B: response_b [SEP]
    # This is a robust way to present the three pieces of text to the model
    
    # Combine response_a and response_b into a single string
    response_pair = [f"A: {a} {tokenizer.sep_token} B: {b}" for a, b in zip(examples['response_a'], examples['response_b'])]
    
    # Tokenize, using prompt as the first sequence and the combined response as the second
    tokenized_inputs = tokenizer(
        examples['prompt'],
        response_pair, # This will be the second sequence
        max_length=max_length,
        truncation=True, # Need to consider whitch option is better
        padding=False # DataCollator will handle dynamic padding
    )
    
    # Add labels
    tokenized_inputs["labels"] = examples["labels"]
    return tokenized_inputs

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_df.columns.tolist())
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_df.columns.tolist())


# Data collator will dynamically pad batches to the max length *in that batch*
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Data preparation complete.")


Using model: microsoft/deberta-v3-base
LoRA adapter will be saved to: ./models/lora_adapter_deberta-v3-base
Training samples: 45981, Validation samples: 11496




Tokenizing datasets...


Map: 100%|██████████| 45981/45981 [00:23<00:00, 1936.88 examples/s]
Map: 100%|██████████| 11496/11496 [00:07<00:00, 1638.31 examples/s]

Data preparation complete.





In [15]:
# Fine-tuning with LoRA (NO Quantization)
# FINAL SOLUTION: Skip quantization entirely - it has compatibility issues with DeBERTa-v3
# Use gradient checkpointing for memory efficiency

# Check if LoRA adapter already exists
if os.path.exists(LORA_ADAPTER_DIR) and os.path.exists(os.path.join(LORA_ADAPTER_DIR, "adapter_config.json")):
    print(f"LoRA adapter already exists at: {LORA_ADAPTER_DIR}")
    print("Skipping training and loading existing adapter...")
    
    # Load base model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
    )
    
    # Load the saved LoRA adapter
    from peft import PeftModel
    peft_model = PeftModel.from_pretrained(model, LORA_ADAPTER_DIR)
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(LORA_ADAPTER_DIR)
    
    # Create a minimal trainer for prediction only
    training_args = TrainingArguments(
        output_dir=f"./{MODEL_NAME.split('/')[-1]}-checkpoints",
        per_device_eval_batch_size=8,
        fp16=False,
        report_to="none",
    )
    
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    print("Successfully loaded existing LoRA adapter.")
    
else:
    print("No existing LoRA adapter found. Starting training from scratch...")
    
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
    )

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=[
            "query_proj", 
            "key_proj", 
            "value_proj",
            "dense"
        ],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        inference_mode=False
    )
    # lora_config = LoraConfig(
    #     r=16,
    #     lora_alpha=32,
    #     target_modules=[
    #         "query_layer",  # <-- FIX: "query_proj" -> "query_layer"
    #         "key_layer",    # <-- FIX: "key_proj" -> "key_layer"
    #         "value_layer",  # <-- FIX: "value_proj" -> "value_layer"
    #         "dense"         # 'dense' is correct and targets FFN layers
    #     ],
    #     lora_dropout=0.1,
    #     bias="none",
    #     task_type=TaskType.SEQ_CLS # This is a sequence classification task
    # )
    # Apply LoRA to the model
    peft_model = get_peft_model(model, lora_config)
    peft_model.print_trainable_parameters()

    # --- Define Custom Compute Metrics ---
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        probs = softmax(logits, axis=1)
        
        eps = 1e-15
        probs = np.clip(probs, eps, 1 - eps)
        
        loss = log_loss(labels, probs)
        return {"log_loss": loss}

    # --- Define Training Arguments ---
    training_args = TrainingArguments(
        output_dir=f"./{MODEL_NAME.split('/')[-1]}-checkpoints",
        num_train_epochs=2,
        per_device_train_batch_size=4,  # Reduced for memory
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,  # Effective batch size = 4*2 = 8
        learning_rate=2e-5,
        weight_decay=0.01,

        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,

        load_best_model_at_end=True,
        metric_for_best_model="log_loss",
        greater_is_better=False,

        logging_steps=100,
        fp16=False,  # Disable fp16 due to gradient checkpointing conflict
        report_to="none",
        gradient_checkpointing=True,  # Enable gradient checkpointing
    )

    # --- Initialize Trainer ---
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # --- Start Training ---
    print("Starting fine-tuning...")
    trainer.train()

    # --- Save the final LoRA adapter ---
    # This saves only the small, trainable adapter weights
    os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)
    trainer.model.save_pretrained(LORA_ADAPTER_DIR)

    # Also save the tokenizer
    tokenizer.save_pretrained(LORA_ADAPTER_DIR)

    print(f"Training complete. LoRA adapter saved to: {LORA_ADAPTER_DIR}")

# --- Predict on Test Data and Create Kaggle Submission ---
print("\n=== Generating Kaggle Submission ===")

# Make predictions on the tokenized test dataset
def preprocess_test_function(examples):
    # This formats the input as: [CLS] prompt [SEP] A: response_a [SEP] B: response_b [SEP]
    
    # Combine response_a and response_b into a single string
    response_pair = [f"A: {a} {tokenizer.sep_token} B: {b}" for a, b in zip(examples['response_a'], examples['response_b'])]
    
    # Tokenize, using prompt as the first sequence and the combined response as the second
    tokenized_inputs = tokenizer(
        examples['prompt'],
        response_pair, # This will be the second sequence
        max_length=max_length,
        truncation=True, # Need to consider whitch option is better
        padding=False # DataCollator will handle dynamic padding
    )
    return tokenized_inputs
print("Tokenizing test dataset...")
test_dataset = Dataset.from_pandas(test)
tokenized_test_dataset = test_dataset.map(preprocess_test_function, batched=True, remove_columns=test.columns.tolist())

print("Test data tokenization complete.")

predictions = trainer.predict(tokenized_test_dataset)

# Extract logits and convert to probabilities
test_logits = predictions.predictions
test_probs = softmax(test_logits, axis=1)

# Create submission file using the helper function
create_and_save_submission(
    predictions=test_probs,
    filename=f"submission_candidate1_deberta_lora_{MODEL_NAME.split('/')[-1]}.csv",
    test_df=test,
    sample_df=sample
)

print("Kaggle submission file created successfully!")


LoRA adapter already exists at: ./models/lora_adapter_deberta-v3-base
Skipping training and loading existing adapter...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Successfully loaded existing LoRA adapter.

=== Generating Kaggle Submission ===
Tokenizing test dataset...


Map: 100%|██████████| 3/3 [00:00<00:00, 280.59 examples/s]


Test data tokenization complete.


Creating submission file: submission_candidate1_deberta_lora_deberta-v3-base.csv...
Successfully saved and verified: submission_candidate1_deberta_lora_deberta-v3-base.csv (Shape: (3, 4))
Kaggle submission file created successfully!


### Candidate 1: Calibration

In [16]:
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import log_loss
import numpy as np

print("\n=== Candidate 1 (DeBERTa + LoRA): Calibration ===")

# Get validation predictions from the existing trainer
val_predictions = trainer.predict(tokenized_val_dataset)
val_logits = val_predictions.predictions
val_probs_before = softmax(val_logits, axis=1)

# Get true labels from val_df
y_val_c1 = val_df['labels'].values

# Calculate log loss BEFORE calibration
logloss_before = log_loss(y_val_c1, val_probs_before)
print(f"Validation LogLoss BEFORE Calibration: {logloss_before:.6f}")

# Apply Isotonic Calibration per class
# We'll calibrate each class probability separately
print("Applying Isotonic calibration...")

calibrators = []
val_probs_calibrated = np.zeros_like(val_probs_before)

for class_idx in range(3):
    # Get probabilities for this class
    class_probs = val_probs_before[:, class_idx]
    
    # Create binary labels (1 if true class, 0 otherwise)
    y_binary = (y_val_c1 == class_idx).astype(int)
    
    # Fit isotonic regression
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(class_probs, y_binary)
    
    # Calibrate
    val_probs_calibrated[:, class_idx] = iso.predict(class_probs)
    
    calibrators.append(iso)

# Normalize probabilities to sum to 1
row_sums = val_probs_calibrated.sum(axis=1, keepdims=True)
val_probs_calibrated = val_probs_calibrated / np.clip(row_sums, 1e-15, None)

# Calculate log loss AFTER calibration
logloss_after = log_loss(y_val_c1, val_probs_calibrated)
print(f"Validation LogLoss AFTER Calibration: {logloss_after:.6f}")
print(f"Improvement: {logloss_before - logloss_after:.6f}")

# Now predict on test set with calibration
print("\nGenerating calibrated predictions for test set...")

# Get test predictions
test_predictions = trainer.predict(tokenized_test_dataset)
test_logits = test_predictions.predictions
test_probs_uncalibrated = softmax(test_logits, axis=1)

# Apply calibration
test_probs_calibrated = np.zeros_like(test_probs_uncalibrated)
for class_idx in range(3):
    class_probs = test_probs_uncalibrated[:, class_idx]
    test_probs_calibrated[:, class_idx] = calibrators[class_idx].predict(class_probs)

# Normalize
row_sums = test_probs_calibrated.sum(axis=1, keepdims=True)
test_probs_calibrated = test_probs_calibrated / np.clip(row_sums, 1e-15, None)

# Save calibrated submission
create_and_save_submission(
    predictions=test_probs_calibrated,
    filename=f"submission_candidate1_deberta_lora_CALIBRATED.csv",
    test_df=test,
    sample_df=sample
)

print(f"\n=== Candidate 1 Summary ===")
print(f"Before Calibration - Val LogLoss: {logloss_before:.6f}")
print(f"After Calibration  - Val LogLoss: {logloss_after:.6f}")
print(f"Final submission saved with calibration.")


=== Candidate 1 (DeBERTa + LoRA): Calibration ===


Validation LogLoss BEFORE Calibration: 1.098774
Applying Isotonic calibration...
Validation LogLoss AFTER Calibration: 1.073453
Improvement: 0.025321

Generating calibrated predictions for test set...


Creating submission file: submission_candidate1_deberta_lora_CALIBRATED.csv...
Successfully saved and verified: submission_candidate1_deberta_lora_CALIBRATED.csv (Shape: (3, 4))

=== Candidate 1 Summary ===
Before Calibration - Val LogLoss: 1.098774
After Calibration  - Val LogLoss: 1.073453
Final submission saved with calibration.


### Candidate 2: PLM + LightGBM(XGBoost)

In [17]:
### Candidate 2: PLM + LightGBM(XGBoost) ###
import lightgbm as lgb
import xgboost as xgb
import joblib # For saving models
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import time
import os
import torch

# Choose the model to run: "LGBM" or "XGBOOST"

MODEL_NAME = "e5-base-v2" 
BEST_EMBEDDING_MODEL_PATH = f"./models/{MODEL_NAME}"
print(f"Using embedding model: {BEST_EMBEDDING_MODEL_PATH}")

# Load the chosen embedding model
try:
    # We pass a list containing only our chosen model path
    sbert_model, model_src = load_model([BEST_EMBEDDING_MODEL_PATH], idx=0, device=device)
    # print(f"Successfully loaded model from: {model_src}")
except Exception as e:
    print(f"Failed to load model from {BEST_EMBEDDING_MODEL_PATH}. Error: {e}")

print("Encoding texts")
start_time = time.time()

# Encode training data
prompt_emb = encode_texts(sbert_model, train["prompt"])
print("Prompt encoding complete.")
a_emb = encode_texts(sbert_model, train["response_a"])
print("Response A encoding complete.")
b_emb = encode_texts(sbert_model, train["response_b"])
print("Response B encoding complete.")

# Build training features
X_c2 = build_feat(prompt_emb, a_emb, b_emb) # X for candidate 2

# Encode test data
print("Encoding test data...")
prompt_emb_te = encode_texts(sbert_model, test["prompt"])
a_emb_te = encode_texts(sbert_model, test["response_a"])
b_emb_te = encode_texts(sbert_model, test["response_b"])
print("Test encoding complete.")

# Build test features
X_test_c2 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)

# Clean up model from memory
del sbert_model, prompt_emb, a_emb, b_emb, prompt_emb_te, a_emb_te, b_emb_te

print(f"Feature extraction complete. Time taken: {time.time() - start_time:.2f}s")
print(f"Train features shape (X_c2): {X_c2.shape}")
print(f"Test features shape (X_test_c2): {X_test_c2.shape}")

# Create Train/Validation Split
X_tr, X_va, y_tr, y_va = train_test_split(X_c2, y, test_size=val_size, stratify=y, random_state=random_state)
print(f"Data split into: Train {X_tr.shape}, Validation {X_va.shape}")


Using embedding model: ./models/e5-base-v2
try: ./models/e5-base-v2
loaded model from: ./models/e5-base-v2
Encoding texts
Prompt encoding complete.
Response A encoding complete.
Response B encoding complete.
Encoding test data...
Test encoding complete.
Feature extraction complete. Time taken: 1465.02s
Train features shape (X_c2): (57477, 6144)
Test features shape (X_test_c2): (3, 6144)
Data split into: Train (45981, 6144), Validation (11496, 6144)


In [14]:
# Path to save the trained GBM model
GBM_CHOICE = "LGBM" 
CANDIDATE_2_MODEL_SAVE_PATH = f"./models/candidate_2_{GBM_CHOICE.lower()}_{BEST_EMBEDDING_MODEL_PATH.split('/')[-1]}.pkl"
print(f"--- Candidate 2: {GBM_CHOICE} ---")
print(f"Model will be saved to: {CANDIDATE_2_MODEL_SAVE_PATH}")

start_time = time.time()

# This dictionary will hold the best validation logloss
val_logloss = {}
clf_c2 = None

if GBM_CHOICE == "LGBM":
    # --- 2.1. LightGBM ---
    clf_c2 = lgb.LGBMClassifier(
        objective='multiclass',
        metric='multi_logloss',
        num_class=3,
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=random_state,
        # device='gpu' if device == 'cuda' else 'cpu'
        device=device
    )
    
    clf_c2.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(100, verbose=True)]
    )
    
    va_pred = clf_c2.predict_proba(X_va)
    val_logloss['LGBM'] = log_loss(y_va, va_pred)
    print(f"--- LGBM Validation LogLoss: {val_logloss['LGBM']:.6f} ---")

elif GBM_CHOICE == "XGBOOST":
    # --- 2.2. XGBoost ---
    clf_c2 = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=random_state,
        device=device,
        early_stopping_rounds=100
    )
    
    clf_c2.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=True
    )
    
    va_pred = clf_c2.predict_proba(X_va)
    val_logloss['XGBOOST'] = log_loss(y_va, va_pred)
    print(f"--- XGBOOST Validation LogLoss: {val_logloss['XGBOOST']:.6f} ---")

else:
    print(f"Error: Unknown GBM_CHOICE '{GBM_CHOICE}'. Please set to 'LGBM' or 'XGBOOST'.")

print(f"Training complete. Time taken: {time.time() - start_time:.2f}s")

if clf_c2 is not None:
    # Use joblib for cross-compatibility between LGBM/XGB
    joblib.dump(clf_c2, CANDIDATE_2_MODEL_SAVE_PATH)
    print(f"Model saved to: {CANDIDATE_2_MODEL_SAVE_PATH}")

--- Candidate 2: LGBM ---
Model will be saved to: ./models/candidate_2_lgbm_e5-base-v2.pkl
[LightGBM] [Info] Total Bins 1566720
[LightGBM] [Info] Number of data points in the train set: 45981, number of used features: 6144
[LightGBM] [Info] Start training from score -1.052457
[LightGBM] [Info] Start training from score -1.073231
[LightGBM] [Info] Start training from score -1.174353
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[334]	valid_0's multi_logloss: 1.03351
--- LGBM Validation LogLoss: 1.033509 ---
Training complete. Time taken: 527.87s
Model saved to: ./models/candidate_2_lgbm_e5-base-v2.pkl




In [18]:
# Path to save the trained GBM model
GBM_CHOICE = "XGBOOST" 
CANDIDATE_2_MODEL_SAVE_PATH = f"./models/candidate_2_{GBM_CHOICE.lower()}_{BEST_EMBEDDING_MODEL_PATH.split('/')[-1]}.pkl"
print(f"--- Candidate 2: {GBM_CHOICE} ---")
print(f"Model will be saved to: {CANDIDATE_2_MODEL_SAVE_PATH}")

start_time = time.time()

# This dictionary will hold the best validation logloss
val_logloss = {}
clf_c2 = None

if GBM_CHOICE == "LGBM":
    # --- 2.1. LightGBM ---
    clf_c2 = lgb.LGBMClassifier(
        objective='multiclass',
        metric='multi_logloss',
        num_class=3,
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=random_state,
        # device='gpu' if device == 'cuda' else 'cpu'
        device=device
    )
    
    clf_c2.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(100, verbose=True)]
    )
    
    va_pred = clf_c2.predict_proba(X_va)
    val_logloss['LGBM'] = log_loss(y_va, va_pred)
    print(f"--- LGBM Validation LogLoss: {val_logloss['LGBM']:.6f} ---")

elif GBM_CHOICE == "XGBOOST":
    # --- 2.2. XGBoost ---
    clf_c2 = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=random_state,
        device=device,
        early_stopping_rounds=100
    )
    
    clf_c2.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=True
    )
    
    va_pred = clf_c2.predict_proba(X_va)
    val_logloss['XGBOOST'] = log_loss(y_va, va_pred)
    print(f"--- XGBOOST Validation LogLoss: {val_logloss['XGBOOST']:.6f} ---")

else:
    print(f"Error: Unknown GBM_CHOICE '{GBM_CHOICE}'. Please set to 'LGBM' or 'XGBOOST'.")

print(f"Training complete. Time taken: {time.time() - start_time:.2f}s")

if clf_c2 is not None:
    # Use joblib for cross-compatibility between LGBM/XGB
    joblib.dump(clf_c2, CANDIDATE_2_MODEL_SAVE_PATH)
    print(f"Model saved to: {CANDIDATE_2_MODEL_SAVE_PATH}")

--- Candidate 2: XGBOOST ---
Model will be saved to: ./models/candidate_2_xgboost_e5-base-v2.pkl
[0]	validation_0-mlogloss:1.09635
[1]	validation_0-mlogloss:1.09427
[2]	validation_0-mlogloss:1.09219
[3]	validation_0-mlogloss:1.09043
[4]	validation_0-mlogloss:1.08872
[5]	validation_0-mlogloss:1.08706
[6]	validation_0-mlogloss:1.08541
[7]	validation_0-mlogloss:1.08398
[8]	validation_0-mlogloss:1.08247
[9]	validation_0-mlogloss:1.08107
[10]	validation_0-mlogloss:1.07967
[11]	validation_0-mlogloss:1.07837
[12]	validation_0-mlogloss:1.07707
[13]	validation_0-mlogloss:1.07584
[14]	validation_0-mlogloss:1.07469
[15]	validation_0-mlogloss:1.07350
[16]	validation_0-mlogloss:1.07261
[17]	validation_0-mlogloss:1.07167
[18]	validation_0-mlogloss:1.07075
[19]	validation_0-mlogloss:1.06978
[20]	validation_0-mlogloss:1.06879
[21]	validation_0-mlogloss:1.06788
[22]	validation_0-mlogloss:1.06699
[23]	validation_0-mlogloss:1.06616
[24]	validation_0-mlogloss:1.06532
[25]	validation_0-mlogloss:1.06454
[26

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


--- XGBOOST Validation LogLoss: 1.035757 ---
Training complete. Time taken: 120.73s
Model saved to: ./models/candidate_2_xgboost_e5-base-v2.pkl


In [19]:
# --- ADDED CELL: Candidate 2 Full Train & Submission ---

print("\n--- Candidate 2 (XGBoost): Full Training & Submission ---")

try:
    best_iter = clf_c2.best_iteration_ or 1000
except AttributeError:
    print("Warning: 'best_iteration_' not found. Defaulting to 1000 estimators.")
    best_iter = 1000

if GBM_CHOICE == "LGBM":
    clf_c2_full = lgb.LGBMClassifier(
        objective='multiclass', metric='multi_logloss', num_class=3,
        n_estimators=best_iter, # Use best iteration from validation
        learning_rate=0.05,
        n_jobs=-1, random_state=random_state, device=device
    )
elif GBM_CHOICE == "XGBOOST":
    clf_c2_full = xgb.XGBClassifier(
    objective='multi:softprob', eval_metric='mlogloss', num_class=3,
    n_estimators=best_iter, # Use best iteration from validation
    learning_rate=0.05,
    n_jobs=-1, random_state=random_state, device=device
    )

print(f"Training full {GBM_CHOICE} model with {best_iter} estimators...")
# (Assumes X_c2 and y are the full datasets)
clf_c2_full.fit(X_c2, y) 
print("Full model trained.")

# Predict on test set
pred_c2_lgbm = clf_c2_full.predict_proba(X_test_c2) 

# Save submission
GBM_SAVE_PATH = f"./models/candidate_2_{GBM_CHOICE}_{MODEL_NAME}_full.pkl"
joblib.dump(clf_c2_full, GBM_SAVE_PATH)
print(f"Full {GBM_CHOICE} model saved to: {GBM_SAVE_PATH}")

create_and_save_submission(
    predictions=pred_c2_lgbm,
    filename=f"submission_candidate2_{GBM_CHOICE}_{MODEL_NAME}.csv",
    test_df=test, sample_df=sample
)


--- Candidate 2 (XGBoost): Full Training & Submission ---
Training full XGBOOST model with 1000 estimators...
Full model trained.
Full XGBOOST model saved to: ./models/candidate_2_XGBOOST_e5-base-v2_full.pkl
Creating submission file: submission_candidate2_XGBOOST_e5-base-v2.csv...
Successfully saved and verified: submission_candidate2_XGBOOST_e5-base-v2.csv (Shape: (3, 4))


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.241118,0.446204,0.312677
1,211333,0.288413,0.355221,0.356366
2,1233961,0.379745,0.376088,0.244166


### Candidate 2: Calibration

In [20]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
import numpy as np

print("\n=== Candidate 2 (XGBoost): Calibration ===")

# Get validation predictions BEFORE calibration
# We need to recreate the train/val split from earlier
X_tr_c2, X_va_c2, y_tr_c2, y_va_c2 = train_test_split(
    X_c2, y, test_size=val_size, stratify=y, random_state=random_state
)

# Get predictions from the validation model (clf_c2 trained on X_tr, X_va split)
if 'clf_c2' in locals() and clf_c2 is not None:
    va_pred_before = clf_c2.predict_proba(X_va_c2)
    logloss_before = log_loss(y_va_c2, va_pred_before)
    print(f"Validation LogLoss BEFORE Calibration: {logloss_before:.6f}")
    
    # Apply calibration (using isotonic regression)
    print("Applying Isotonic calibration...")
    calibrated_model_c2 = CalibratedClassifierCV(
        clf_c2,
        method='isotonic',  # isotonic or sigmoid
        cv='prefit',  # Model is already fitted
        ensemble=False
    )
    
    # Fit calibration on validation set
    calibrated_model_c2.fit(X_va_c2, y_va_c2)
    
    # Get calibrated predictions on validation set
    va_pred_after = calibrated_model_c2.predict_proba(X_va_c2)
    logloss_after = log_loss(y_va_c2, va_pred_after)
    print(f"Validation LogLoss AFTER Calibration: {logloss_after:.6f}")
    print(f"Improvement: {logloss_before - logloss_after:.6f}")
    
    # Now retrain on full data and apply calibration
    print("\nRetraining on full data for final submission...")
    
    # Get best iteration
    try:
        best_iter = clf_c2.best_iteration_ or 1000
    except AttributeError:
        best_iter = 1000
    
    # Train on 80% of full data
    X_train_full, X_cal, y_train_full, y_cal = train_test_split(
        X_c2, y, test_size=0.2, stratify=y, random_state=random_state
    )
    
    if GBM_CHOICE == "LGBM":
        clf_c2_for_calib = lgb.LGBMClassifier(
            objective='multiclass', metric='multi_logloss', num_class=3,
            n_estimators=best_iter, learning_rate=0.05,
            n_jobs=-1, random_state=random_state, device=device
        )
    elif GBM_CHOICE == "XGBOOST":
        clf_c2_for_calib = xgb.XGBClassifier(
            objective='multi:softprob', eval_metric='mlogloss', num_class=3,
            n_estimators=best_iter, learning_rate=0.05,
            n_jobs=-1, random_state=random_state, device=device
        )
    
    clf_c2_for_calib.fit(X_train_full, y_train_full)
    
    # Apply calibration on the hold-out 20%
    calibrated_final_c2 = CalibratedClassifierCV(
        clf_c2_for_calib,
        method='isotonic',
        cv='prefit',
        ensemble=False
    )
    calibrated_final_c2.fit(X_cal, y_cal)
    
    # Predict on test set with calibration
    pred_c2_calibrated = calibrated_final_c2.predict_proba(X_test_c2)
    
    # Save calibrated submission
    create_and_save_submission(
        predictions=pred_c2_calibrated,
        filename=f"submission_candidate2_{GBM_CHOICE}_{MODEL_NAME}_CALIBRATED.csv",
        test_df=test,
        sample_df=sample
    )
    
    print(f"\n=== Candidate 2 Summary ===")
    print(f"Before Calibration - Val LogLoss: {logloss_before:.6f}")
    print(f"After Calibration  - Val LogLoss: {logloss_after:.6f}")
    print(f"Final submission saved with calibration.")
else:
    print("Error: clf_c2 model not found. Please run the training cell first.")


=== Candidate 2 (XGBoost): Calibration ===
Validation LogLoss BEFORE Calibration: 1.035757
Applying Isotonic calibration...




Validation LogLoss AFTER Calibration: 1.030071
Improvement: 0.005686

Retraining on full data for final submission...




Creating submission file: submission_candidate2_XGBOOST_e5-base-v2_CALIBRATED.csv...
Successfully saved and verified: submission_candidate2_XGBOOST_e5-base-v2_CALIBRATED.csv (Shape: (3, 4))

=== Candidate 2 Summary ===
Before Calibration - Val LogLoss: 1.035757
After Calibration  - Val LogLoss: 1.030071
Final submission saved with calibration.


### Candidate 3: All Features + MLP

In [6]:
### Candidate 3: All Features + MLP ###
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import joblib
import os

# Define the "Strong" lexical feature builder
def stats_strong(s):
    """Calculates a comprehensive set of lexical statistics."""
    if not isinstance(s, str): s = ""
    toks = s.split()
    return {
        "len_char": len(s),
        "len_tok": len(toks),
        "num_sent": sum(s.count(x) for x in [".", "!", "?"]),
        "num_code": s.count("`"),
        "num_list": s.count("- ") + s.count("* "),
        "num_upper": sum(ch.isupper() for ch in s),
        "num_punct": sum(ch in ",;:()" for ch in s),
        "avg_tok_len": (sum(len(t) for t in toks) / len(toks)) if toks else 0.0,
    }

def build_strong_lexical_features(df):
    """Builds the full set of lexical and bias features."""
    rows = []
    cols = ["prompt", "response_a", "response_b"]
    
    for p, a, b in zip(df[cols[0]], df[cols[1]], df[cols[2]]):
        ps, as_, bs = stats_strong(p), stats_strong(a), stats_strong(b)
        rows.append({
            "p_len_char": ps["len_char"], "p_len_tok": ps["len_tok"], "p_num_sent": ps["num_sent"],
            "a_len_char": as_["len_char"], "a_len_tok": as_["len_tok"], "a_num_sent": as_["num_sent"],
            "a_num_code": as_["num_code"], "a_num_list": as_["num_list"], "a_num_upper": as_["num_upper"],
            "a_num_punct": as_["num_punct"], "a_avg_tok_len": as_["avg_tok_len"],
            "b_len_char": bs["len_char"], "b_len_tok": bs["len_tok"], "b_num_sent": bs["num_sent"],
            "b_num_code": bs["num_code"], "b_num_list": bs["num_list"], "b_num_upper": bs["num_upper"],
            "b_num_punct": bs["num_punct"], "b_avg_tok_len": bs["avg_tok_len"],
            # A-B Differences
            "d_len_char": as_["len_char"] - bs["len_char"],
            "d_len_tok": as_["len_tok"] - bs["len_tok"],
            "d_num_sent": as_["num_sent"] - bs["num_sent"],
            "d_num_code": as_["num_code"] - bs["num_code"],
            "d_num_list": as_["num_list"] - bs["num_list"],
            "d_num_upper": as_["num_upper"] - bs["num_upper"],
            "d_num_punct": as_["num_punct"] - bs["num_punct"],
            "d_avg_tok_len": as_["avg_tok_len"] - bs["avg_tok_len"],
            # Ratios
            "r_len_char": (as_["len_char"] + 1) / (bs["len_char"] + 1),
            "r_len_tok": (as_["len_tok"] + 1) / (bs["len_tok"] + 1),
            "r_num_sent": (as_["num_sent"] + 1) / (bs["num_sent"] + 1),
        })
    return pd.DataFrame(rows)

print("--- Candidate 3: All Features + MLP ---")
print("Building strong lexical features...")

# Generate Lexical Features
X_lex_strong = build_strong_lexical_features(train)
X_test_lex_strong = build_strong_lexical_features(test)

print(f"Strong lexical features shape: {X_lex_strong.shape}")

--- Candidate 3: All Features + MLP ---
Building strong lexical features...
Strong lexical features shape: (57477, 30)


In [7]:
import joblib # For saving models
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import time
import os
import torch

MODEL_NAME = "e5-base-v2" 
BEST_EMBEDDING_MODEL_PATH = f"./models/{MODEL_NAME}"
print(f"Using embedding model: {BEST_EMBEDDING_MODEL_PATH}")

# Load the chosen embedding model
try:
    # We pass a list containing only our chosen model path
    sbert_model, model_src = load_model([BEST_EMBEDDING_MODEL_PATH], idx=0, device=device)
    # print(f"Successfully loaded model from: {model_src}")
except Exception as e:
    print(f"Failed to load model from {BEST_EMBEDDING_MODEL_PATH}. Error: {e}")

print("Encoding texts")
start_time = time.time()

# Encode training data
prompt_emb = encode_texts(sbert_model, train["prompt"])
print("Prompt encoding complete.")
a_emb = encode_texts(sbert_model, train["response_a"])
print("Response A encoding complete.")
b_emb = encode_texts(sbert_model, train["response_b"])
print("Response B encoding complete.")

# Build training features
X_c3 = build_feat(prompt_emb, a_emb, b_emb) # X for candidate 3

# Encode test data
print("Encoding test data...")
prompt_emb_te = encode_texts(sbert_model, test["prompt"])
a_emb_te = encode_texts(sbert_model, test["response_a"])
b_emb_te = encode_texts(sbert_model, test["response_b"])
print("Encoding complete for test data.")

# Build test features
X_test_c3 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)

# Clean up model from memory
del sbert_model, prompt_emb, a_emb, b_emb, prompt_emb_te, a_emb_te, b_emb_te

print(f"Feature extraction complete. Time taken: {time.time() - start_time:.2f}s")
print(f"Train features shape (X_c3): {X_c3.shape}")
print(f"Test features shape (X_test_c3): {X_test_c3.shape}")

# Create Train/Validation Split
X_tr, X_va, y_tr, y_va = train_test_split(X_c3, y, test_size=val_size, stratify=y)
print(f"Data split into: Train {X_tr.shape}, Validation {X_va.shape}")

Using embedding model: ./models/e5-base-v2
try: ./models/e5-base-v2
loaded model from: ./models/e5-base-v2
Encoding texts
Prompt encoding complete.
Response A encoding complete.
Response B encoding complete.
Encoding test data...
Encoding complete for test data.
Feature extraction complete. Time taken: 1461.08s
Train features shape (X_c3): (57477, 6144)
Test features shape (X_test_c3): (3, 6144)
Data split into: Train (45981, 6144), Validation (11496, 6144)


In [8]:

# Handle NaNs/Infs for safety
X_lex_strong = X_lex_strong.fillna(0).replace([np.inf, -np.inf], 0)
X_test_lex_strong = X_test_lex_strong.fillna(0).replace([np.inf, -np.inf], 0)

# Convert embedding features (numpy) to DataFrame for safe processing
X_c3_safe = pd.DataFrame(X_c3).fillna(0).replace([np.inf, -np.inf], 0).values
X_test_c3_safe = pd.DataFrame(X_test_c3).fillna(0).replace([np.inf, -np.inf], 0).values

print("Combining lexical and embedding features...")
X_c3 = np.hstack([X_lex_strong, X_c3_safe])
X_test_c3 = np.hstack([X_test_lex_strong, X_test_c3_safe])
print(f"Combined train features shape (X_c3): {X_c3.shape}")
print(f"Combined test features shape (X_test_c3): {X_test_c3.shape}")

X_tr, X_va, y_tr, y_va = train_test_split(X_c3, y, test_size=val_size, stratify=y, random_state=random_state)

print("Scaling features...")
scaler_c3 = StandardScaler()
X_tr_sc = scaler_c3.fit_transform(X_tr)
X_va_sc = scaler_c3.transform(X_va)

# Train MLP Classifier
print("Training MLPClassifier...")
start_time = time.time()

clf_c3 = MLPClassifier(
    hidden_layer_sizes=(512, 256),
    activation="relu",
    solver="adam",
    alpha=1e-4,          # L2 regularization
    batch_size=512,
    learning_rate_init=1e-3,
    max_iter=100,        
    early_stopping=True,
    n_iter_no_change=5,
    random_state=random_state,
    verbose=True
)

clf_c3.fit(X_tr_sc, y_tr)

va_pred = clf_c3.predict_proba(X_va_sc)
val_logloss = log_loss(y_va, va_pred)

print(f"Training complete. Time taken: {time.time() - start_time:.2f}s")
print(f"--- MLP (All Features) Validation LogLoss: {val_logloss:.6f} ---")

CANDIDATE_3_MODEL_SAVE_PATH = "./models/candidate_3_mlp.pkl"
CANDIDATE_3_SCALER_SAVE_PATH = "./models/candidate_3_scaler.pkl"

joblib.dump(clf_c3, CANDIDATE_3_MODEL_SAVE_PATH)
joblib.dump(scaler_c3, CANDIDATE_3_SCALER_SAVE_PATH)

print(f"MLP model saved to: {CANDIDATE_3_MODEL_SAVE_PATH}")
print(f"Scaler saved to: {CANDIDATE_3_SCALER_SAVE_PATH}")


Combining lexical and embedding features...
Combined train features shape (X_c3): (57477, 6174)
Combined test features shape (X_test_c3): (3, 6174)
Scaling features...
Training MLPClassifier...
Iteration 1, loss = 1.12600698
Validation score: 0.463796
Iteration 2, loss = 0.82086238
Validation score: 0.447271
Iteration 3, loss = 0.53996426
Validation score: 0.430093
Iteration 4, loss = 0.26805010
Validation score: 0.429441
Iteration 5, loss = 0.12704382
Validation score: 0.448141
Iteration 6, loss = 0.06509657
Validation score: 0.437486
Iteration 7, loss = 0.04185474
Validation score: 0.436834
Validation score did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Training complete. Time taken: 56.44s
--- MLP (All Features) Validation LogLoss: 1.048909 ---
MLP model saved to: ./models/candidate_3_mlp.pkl
Scaler saved to: ./models/candidate_3_scaler.pkl


In [9]:
# --- ADDED CELL: Candidate 3 (MLP) Full Train & Submission ---
# This cell assumes 'X_c3', 'X_test_c3', 'y', 'random_state', 'test', 'sample' exist.

print("\n--- Candidate 3 (MLP): Full Training & Submission ---")

# 1. Scale ALL data
print("Scaling full dataset for submission...")
scaler_c3_full = StandardScaler()
X_all_sc_c3 = scaler_c3_full.fit_transform(X_c3)
X_test_sc_c3 = scaler_c3_full.transform(X_test_c3)

# 2. Train new model on ALL data
print("Training final MLP on ALL data...")
start_time_c3_full = time.time()
clf_c3_full = MLPClassifier(
    hidden_layer_sizes=(512, 256), activation="relu", solver="adam",
    alpha=1e-4, batch_size=512, learning_rate_init=1e-3,
    max_iter=100, 
    early_stopping=True, n_iter_no_change=5, # Keep early stopping
    random_state=random_state, verbose=True
)
clf_c3_full.fit(X_all_sc_c3, y) # Train on full X and y
print(f"Full training complete. Time: {time.time() - start_time_c3_full:.2f}s")

# 3. Save model and scaler
CANDIDATE_3_MODEL_SAVE_PATH = "./models/candidate_3_mlp_full.pkl"
CANDIDATE_3_SCALER_SAVE_PATH = "./models/candidate_3_scaler_full.pkl"
joblib.dump(clf_c3_full, CANDIDATE_3_MODEL_SAVE_PATH)
joblib.dump(scaler_c3_full, CANDIDATE_3_SCALER_SAVE_PATH)
print(f"MLP (full) model saved to: {CANDIDATE_3_MODEL_SAVE_PATH}")
print(f"Scaler (full) saved to: {CANDIDATE_3_SCALER_SAVE_PATH}")

# 4. Predict and Save Submission
pred_c3_full = clf_c3_full.predict_proba(X_test_sc_c3)
create_and_save_submission(
    predictions=pred_c3_full,
    filename=f"submission_candidate3_MLP_AllFeatures.csv",
    test_df=test,
    sample_df=sample
)



--- Candidate 3 (MLP): Full Training & Submission ---
Scaling full dataset for submission...
Training final MLP on ALL data...
Iteration 1, loss = 1.10194019
Validation score: 0.470424
Iteration 2, loss = 0.87663369
Validation score: 0.437717
Iteration 3, loss = 0.68406746
Validation score: 0.435978
Iteration 4, loss = 0.45052073
Validation score: 0.420842
Iteration 5, loss = 0.26795151
Validation score: 0.439631
Iteration 6, loss = 0.13355633
Validation score: 0.449374
Iteration 7, loss = 0.07360602
Validation score: 0.446938
Validation score did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Full training complete. Time: 55.82s
MLP (full) model saved to: ./models/candidate_3_mlp_full.pkl
Scaler (full) saved to: ./models/candidate_3_scaler_full.pkl
Creating submission file: submission_candidate3_MLP_AllFeatures.csv...
Successfully saved and verified: submission_candidate3_MLP_AllFeatures.csv (Shape: (3, 4))


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.205456,0.429275,0.365268
1,211333,0.431486,0.20763,0.360884
2,1233961,0.226477,0.352537,0.420986


### Candidate 3: Calibration

In [10]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
import numpy as np

print("\n=== Candidate 3 (MLP All Features): Calibration ===")

# Get validation predictions BEFORE calibration
# We need to recreate the train/val split from earlier
X_tr_c3, X_va_c3, y_tr_c3, y_va_c3 = train_test_split(
    X_c3, y, test_size=val_size, stratify=y, random_state=random_state
)

# Scale validation data
scaler_c3_val = StandardScaler()
X_tr_sc_c3 = scaler_c3_val.fit_transform(X_tr_c3)
X_va_sc_c3 = scaler_c3_val.transform(X_va_c3)

# Get predictions from the validation model (clf_c3 trained on X_tr, X_va split)
if 'clf_c3' in locals() and clf_c3 is not None:
    va_pred_before = clf_c3.predict_proba(X_va_sc_c3)
    logloss_before = log_loss(y_va_c3, va_pred_before)
    print(f"Validation LogLoss BEFORE Calibration: {logloss_before:.6f}")
    
    # Apply calibration (using isotonic regression)
    print("Applying Isotonic calibration...")
    calibrated_model_c3 = CalibratedClassifierCV(
        clf_c3,
        method='isotonic',  # isotonic or sigmoid
        cv='prefit',  # Model is already fitted
        ensemble=False
    )
    
    # Fit calibration on validation set
    calibrated_model_c3.fit(X_va_sc_c3, y_va_c3)
    
    # Get calibrated predictions on validation set
    va_pred_after = calibrated_model_c3.predict_proba(X_va_sc_c3)
    logloss_after = log_loss(y_va_c3, va_pred_after)
    print(f"Validation LogLoss AFTER Calibration: {logloss_after:.6f}")
    print(f"Improvement: {logloss_before - logloss_after:.6f}")
    
    # Now retrain on full data and apply calibration
    print("\nRetraining on full data for final submission...")
    
    # Train on 80% of full data
    X_train_full, X_cal, y_train_full, y_cal = train_test_split(
        X_c3, y, test_size=0.2, stratify=y, random_state=random_state
    )
    
    # Scale
    scaler_final_c3 = StandardScaler()
    X_train_full_sc = scaler_final_c3.fit_transform(X_train_full)
    X_cal_sc = scaler_final_c3.transform(X_cal)
    X_test_final_sc = scaler_final_c3.transform(X_test_c3)
    
    # Train MLP
    clf_c3_for_calib = MLPClassifier(
        hidden_layer_sizes=(512, 256), activation="relu", solver="adam",
        alpha=1e-4, batch_size=512, learning_rate_init=1e-3,
        max_iter=100, early_stopping=True, n_iter_no_change=5,
        random_state=random_state, verbose=False
    )
    clf_c3_for_calib.fit(X_train_full_sc, y_train_full)
    
    # Apply calibration on the hold-out 20%
    calibrated_final_c3 = CalibratedClassifierCV(
        clf_c3_for_calib,
        method='isotonic',
        cv='prefit',
        ensemble=False
    )
    calibrated_final_c3.fit(X_cal_sc, y_cal)
    
    # Predict on test set with calibration
    pred_c3_calibrated = calibrated_final_c3.predict_proba(X_test_final_sc)
    
    # Save calibrated submission
    create_and_save_submission(
        predictions=pred_c3_calibrated,
        filename=f"submission_candidate3_MLP_AllFeatures_CALIBRATED.csv",
        test_df=test,
        sample_df=sample
    )
    
    print(f"\n=== Candidate 3 Summary ===")
    print(f"Before Calibration - Val LogLoss: {logloss_before:.6f}")
    print(f"After Calibration  - Val LogLoss: {logloss_after:.6f}")
    print(f"Final submission saved with calibration.")
else:
    print("Error: clf_c3 model not found. Please run the training cell first.")


=== Candidate 3 (MLP All Features): Calibration ===
Validation LogLoss BEFORE Calibration: 1.048909
Applying Isotonic calibration...




Validation LogLoss AFTER Calibration: 1.036883
Improvement: 0.012026

Retraining on full data for final submission...
Creating submission file: submission_candidate3_MLP_AllFeatures_CALIBRATED.csv...
Successfully saved and verified: submission_candidate3_MLP_AllFeatures_CALIBRATED.csv (Shape: (3, 4))

=== Candidate 3 Summary ===
Before Calibration - Val LogLoss: 1.048909
After Calibration  - Val LogLoss: 1.036883
Final submission saved with calibration.




### Final Step. Final Model and Results

### Final Ensemble: Advanced Strategies

이 섹션에서는 Candidate 1, 2, 3의 calibrated predictions를 최적으로 결합합니다.

**구현할 앙상블 전략:**
1. **Simple Average** - 기준선
2. **Weighted Average** - Validation 성능 기반 가중치
3. **Stacked Generalization** - Meta-learner (Logistic Regression)
4. **Optimal Weights (Grid Search)** - 최적 가중치 탐색
5. **Rank Averaging** - 순위 기반 앙상블

각 전략의 Validation Log Loss를 비교하여 최종 제출용 모델을 선택합니다.

In [23]:
# === Section 1. Imports and Validation Predictions (C1/C2/C3) ===
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize
import glob, os

print("="*80)
print("ADVANCED ENSEMBLE: Loading Calibrated Predictions")
print("="*80)

# Candidate 1 (DeBERTa + LoRA)
print("\n1) Candidate 1 (DeBERTa) validation predictions")
if 'calibrated_model_c1' in locals() and calibrated_model_c1 is not None:
    val_indices_c1 = np.arange(len(tokenized_val_dataset))
    val_pred_c1 = calibrated_model_c1.predict_proba(val_indices_c1.reshape(-1, 1))
    y_val_ensemble = val_df['labels'].values
    print(f"   OK shape: {val_pred_c1.shape}")
    print(f"   Val LogLoss: {log_loss(y_val_ensemble, val_pred_c1):.6f}")
else:
    print("   Missing Candidate 1. Run the calibration cell.")
    val_pred_c1 = None

# Candidate 2 (XGBoost/LightGBM)
print("\n2) Candidate 2 (XGBoost/LGBM) validation predictions")
if 'X_c2' in locals() and 'calibrated_model_c2' in locals() and calibrated_model_c2 is not None:
    X_tr_c2_ens, X_va_c2_ens, y_tr_c2_ens, y_va_c2_ens = train_test_split(
        X_c2, y, test_size=val_size, stratify=y, random_state=random_state
    )
    val_pred_c2 = calibrated_model_c2.predict_proba(X_va_c2_ens)
    print(f"   OK shape: {val_pred_c2.shape}")
    print(f"   Val LogLoss: {log_loss(y_va_c2_ens, val_pred_c2):.6f}")
    if val_pred_c1 is not None and len(y_va_c2_ens) != len(y_val_ensemble):
        print("   Warning: validation set sizes do not match.")
else:
    print("   Missing Candidate 2. Run the calibration cell.")
    val_pred_c2 = None

# Candidate 3 (MLP + All Features)
print("\n3) Candidate 3 (MLP) validation predictions")
if 'X_c3' in locals() and 'calibrated_model_c3' in locals() and calibrated_model_c3 is not None:
    X_tr_c3_ens, X_va_c3_ens, y_tr_c3_ens, y_va_c3_ens = train_test_split(
        X_c3, y, test_size=val_size, stratify=y, random_state=random_state
    )
    scaler_c3_ens = StandardScaler()
    X_tr_sc_c3_ens = scaler_c3_ens.fit_transform(X_tr_c3_ens)
    X_va_sc_c3_ens = scaler_c3_ens.transform(X_va_c3_ens)
    val_pred_c3 = calibrated_model_c3.predict_proba(X_va_sc_c3_ens)
    print(f"   OK shape: {val_pred_c3.shape}")
    print(f"   Val LogLoss: {log_loss(y_va_c3_ens, val_pred_c3):.6f}")
else:
    print("   Missing Candidate 3. Run the calibration cell.")
    val_pred_c3 = None

# Individual model losses
individual_val_losses = {}
if val_pred_c1 is not None:
    individual_val_losses['C1_DeBERTa'] = log_loss(y_val_ensemble, val_pred_c1)
if val_pred_c2 is not None:
    individual_val_losses['C2_XGBoost'] = log_loss(y_va_c2_ens, val_pred_c2)
if val_pred_c3 is not None:
    individual_val_losses['C3_MLP'] = log_loss(y_va_c3_ens, val_pred_c3)

print("\n" + "="*80)
print("Individual Model Performance Summary")
for name, loss in individual_val_losses.items():
    print(f"   {name}: {loss:.6f}")
print("="*80)


ADVANCED ENSEMBLE: Loading Calibrated Predictions

1) Candidate 1 (DeBERTa) validation predictions
   Missing Candidate 1. Run the calibration cell.

2) Candidate 2 (XGBoost/LGBM) validation predictions
   OK shape: (11496, 3)
   Val LogLoss: 1.030071

3) Candidate 3 (MLP) validation predictions
   OK shape: (11496, 3)
   Val LogLoss: 1.036883

Individual Model Performance Summary
   C2_XGBoost: 1.030071
   C3_MLP: 1.036883


In [24]:
# === Section 2. Ensemble Strategy Search (simple, weighted, optimal, stacked, rank) ===
print("\n" + "="*80)
print("TESTING ENSEMBLE STRATEGIES")
print("="*80)

ensemble_results = {}

if val_pred_c1 is not None and val_pred_c2 is not None and val_pred_c3 is not None:
    print("\nEnsuring a common validation set...")
    if len(val_pred_c1) == len(val_pred_c2) == len(val_pred_c3):
        y_val_ref = y_val_ensemble
        print("   All validation sizes match.")
    else:
        min_size = min(len(val_pred_c1), len(val_pred_c2), len(val_pred_c3))
        val_pred_c1 = val_pred_c1[:min_size]
        val_pred_c2 = val_pred_c2[:min_size]
        val_pred_c3 = val_pred_c3[:min_size]
        y_val_ref = y_val_ensemble[:min_size]
        print(f"   Truncated to {min_size} samples.")

    val_preds_stacked = np.stack([val_pred_c1, val_pred_c2, val_pred_c3], axis=0)
    num_classes = val_pred_c1.shape[1]

    # Strategy 1: Simple average
    print("\nStrategy 1: Simple Average")
    ensemble_simple = np.mean(val_preds_stacked, axis=0)
    loss_simple = log_loss(y_val_ref, ensemble_simple)
    ensemble_results['Simple_Average'] = loss_simple
    print(f"   LogLoss: {loss_simple:.6f}")

    # Strategy 2: Weighted average by inverse log-loss
    print("\nStrategy 2: Weighted Average (inverse log loss)")
    losses = np.array([
        log_loss(y_val_ref, val_pred_c1),
        log_loss(y_val_ref, val_pred_c2),
        log_loss(y_val_ref, val_pred_c3),
    ])
    inv_losses = 1.0 / losses
    weights_perf = inv_losses / inv_losses.sum()
    ensemble_weighted = (
        weights_perf[0] * val_pred_c1 +
        weights_perf[1] * val_pred_c2 +
        weights_perf[2] * val_pred_c3
    )
    loss_weighted = log_loss(y_val_ref, ensemble_weighted)
    ensemble_results['Weighted_Average'] = loss_weighted
    print(f"   Weights: {weights_perf.round(3).tolist()}")
    print(f"   LogLoss: {loss_weighted:.6f}")

    # Strategy 3: Optimal weights via SLSQP
    print("\nStrategy 3: Optimal Weights (SLSQP)")
    def ensemble_loss(weights, preds, y_true):
        w = np.array(weights, dtype=float)
        w = w / w.sum()
        ens = sum(w_i * p_i for w_i, p_i in zip(w, preds))
        return log_loss(y_true, ens)

    result = minimize(
        lambda w: ensemble_loss(w, [val_pred_c1, val_pred_c2, val_pred_c3], y_val_ref),
        x0=[1.0, 1.0, 1.0],
        method='SLSQP',
        bounds=[(0.01, 1.0)] * 3
    )
    optimal_weights = result.x / result.x.sum()
    ensemble_optimal = (
        optimal_weights[0] * val_pred_c1 +
        optimal_weights[1] * val_pred_c2 +
        optimal_weights[2] * val_pred_c3
    )
    loss_optimal = log_loss(y_val_ref, ensemble_optimal)
    ensemble_results['Optimal_Weights'] = loss_optimal
    print(f"   Weights: {optimal_weights.round(3).tolist()}")
    print(f"   LogLoss: {loss_optimal:.6f}")

    # Strategy 4: Stacked generalization
    print("\nStrategy 4: Stacked Generalization (Logistic Regression)")
    meta_features = np.hstack([val_pred_c1, val_pred_c2, val_pred_c3])
    X_meta_tr, X_meta_va, y_meta_tr, y_meta_va = train_test_split(
        meta_features, y_val_ref, test_size=0.3, random_state=random_state, stratify=y_val_ref
    )
    meta_learner = LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=random_state)
    meta_learner.fit(X_meta_tr, y_meta_tr)
    ensemble_stacked = meta_learner.predict_proba(X_meta_va)
    loss_stacked = log_loss(y_meta_va, ensemble_stacked)
    ensemble_results['Stacked_Generalization'] = loss_stacked
    print(f"   LogLoss: {loss_stacked:.6f}")

    # Strategy 5: Rank averaging (class-count agnostic)
    print("\nStrategy 5: Rank Averaging")
    def probs_to_ranks(probs):
        ranks = np.zeros_like(probs, dtype=int)
        for i in range(probs.shape[0]):
            ranks[i] = np.argsort(np.argsort(-probs[i]))
        return ranks

    ranks_c1 = probs_to_ranks(val_pred_c1)
    ranks_c2 = probs_to_ranks(val_pred_c2)
    ranks_c3 = probs_to_ranks(val_pred_c3)
    avg_ranks = (ranks_c1 + ranks_c2 + ranks_c3) / 3.0

    ensemble_rank = np.zeros_like(val_pred_c1, dtype=float)
    for i in range(avg_ranks.shape[0]):
        scores = float(num_classes) - avg_ranks[i]  # lower rank -> higher score
        scores = np.exp(scores)
        ensemble_rank[i] = scores / scores.sum()
    loss_rank = log_loss(y_val_ref, ensemble_rank)
    ensemble_results['Rank_Averaging'] = loss_rank
    print(f"   LogLoss: {loss_rank:.6f}")

    # Summary and pick best
    print("\n" + "="*80)
    print("ENSEMBLE STRATEGY COMPARISON")
    print("="*80)
    results_df = pd.DataFrame({
        'Strategy': list(ensemble_results.keys()),
        'Validation_LogLoss': list(ensemble_results.values())
    }).sort_values('Validation_LogLoss')
    print(results_df.to_string(index=False))

    best_strategy = results_df.iloc[0]['Strategy']
    best_loss = results_df.iloc[0]['Validation_LogLoss']
    print("\n" + "="*80)
    print(f"BEST ENSEMBLE STRATEGY: {best_strategy}")
    print(f"Validation LogLoss: {best_loss:.6f}")

    if individual_val_losses:
        best_individual = min(individual_val_losses.values())
        improvement = best_individual - best_loss
        print(f"\nBest Individual: {best_individual:.6f}")
        print(f"Improvement: {improvement:.6f} ({improvement / best_individual * 100:.2f}%)")
    print("="*80)

    # Save decision
    if best_strategy == 'Optimal_Weights':
        final_ensemble_weights, final_ensemble_method = optimal_weights, 'optimal'
    elif best_strategy == 'Weighted_Average':
        final_ensemble_weights, final_ensemble_method = weights_perf, 'weighted'
    elif best_strategy == 'Stacked_Generalization':
        final_ensemble_weights, final_ensemble_method = None, 'stacked'
    elif best_strategy == 'Rank_Averaging':
        final_ensemble_weights, final_ensemble_method = None, 'rank'
    else:
        final_ensemble_weights, final_ensemble_method = np.array([1/3, 1/3, 1/3]), 'simple'
else:
    print("\nERROR: Not all candidate models are available. Run all calibration cells first.")
    final_ensemble_method = None



TESTING ENSEMBLE STRATEGIES

ERROR: Not all candidate models are available. Run all calibration cells first.


In [None]:
# === Section 3. Test-Time Ensembling and Final Summary ===
print("\n" + "="*80)
print("GENERATING FINAL ENSEMBLE PREDICTIONS")
print("="*80)

if final_ensemble_method is not None:
    print("\nLoading test predictions...")
    # C1
    if 'test_probs_calibrated' in locals():
        test_pred_c1 = test_probs_calibrated
        print(f"   C1 shape: {test_pred_c1.shape}")
    else:
        print("   C1 calibrated not found. Using uncalibrated if present.")
        test_pred_c1 = test_probs if 'test_probs' in locals() else None
    # C2
    if 'pred_c2_calibrated' in locals():
        test_pred_c2 = pred_c2_calibrated
        print(f"   C2 shape: {test_pred_c2.shape}")
    else:
        print("   C2 calibrated not found. Generating if possible.")
        if 'calibrated_final_c2' in locals() and 'X_test_c2' in locals():
            test_pred_c2 = calibrated_final_c2.predict_proba(X_test_c2)
            print(f"   Generated C2: {test_pred_c2.shape}")
        else:
            test_pred_c2 = None
    # C3
    if 'pred_c3_calibrated' in locals():
        test_pred_c3 = pred_c3_calibrated
        print(f"   C3 shape: {test_pred_c3.shape}")
    else:
        print("   C3 calibrated not found. Generating if possible.")
        if 'calibrated_final_c3' in locals() and 'X_test_c3' in locals():
            scaler_temp = StandardScaler()
            scaler_temp.fit(X_c3)
            X_test_scaled = scaler_temp.transform(X_test_c3)
            test_pred_c3 = calibrated_final_c3.predict_proba(X_test_scaled)
            print(f"   Generated C3: {test_pred_c3.shape}")
        else:
            test_pred_c3 = None

    if test_pred_c1 is not None and test_pred_c2 is not None and test_pred_c3 is not None:
        print(f"\nUsing ensemble method: {final_ensemble_method.upper()}")
        if final_ensemble_method == 'simple':
            final_test_pred = (test_pred_c1 + test_pred_c2 + test_pred_c3) / 3.0
        elif final_ensemble_method in ['optimal', 'weighted']:
            final_test_pred = (
                final_ensemble_weights[0] * test_pred_c1 +
                final_ensemble_weights[1] * test_pred_c2 +
                final_ensemble_weights[2] * test_pred_c3
            )
        elif final_ensemble_method == 'stacked':
            test_meta_features = np.hstack([test_pred_c1, test_pred_c2, test_pred_c3])
            final_test_pred = meta_learner.predict_proba(test_meta_features)
        elif final_ensemble_method == 'rank':
            def probs_to_ranks_test(probs):
                ranks = np.zeros_like(probs, dtype=int)
                for i in range(probs.shape[0]):
                    ranks[i] = np.argsort(np.argsort(-probs[i]))
                return ranks
            ranks_test_c1 = probs_to_ranks_test(test_pred_c1)
            ranks_test_c2 = probs_to_ranks_test(test_pred_c2)
            ranks_test_c3 = probs_to_ranks_test(test_pred_c3)
            avg_ranks_test = (ranks_test_c1 + ranks_test_c2 + ranks_test_c3) / 3.0
            num_classes_test = test_pred_c1.shape[1]
            final_test_pred = np.zeros_like(test_pred_c1, dtype=float)
            for i in range(avg_ranks_test.shape[0]):
                scores = float(num_classes_test) - avg_ranks_test[i]
                scores = np.exp(scores)
                final_test_pred[i] = scores / scores.sum()

        # Safety normalization
        row_sums = final_test_pred.sum(axis=1, keepdims=True)
        final_test_pred = final_test_pred / np.clip(row_sums, 1e-15, None)

        print(f"\nFinal predictions shape: {final_test_pred.shape}")
        print(f"Probability sum check (first 5): {final_test_pred[:5].sum(axis=1)}")

        final_filename = f"submission_FINAL_ENSEMBLE_{final_ensemble_method.upper()}.csv"
        create_and_save_submission(
            predictions=final_test_pred,
            filename=final_filename,
            test_df=test,
            sample_df=sample
        )

        print("\n" + "="*80)
        print("FINAL ENSEMBLE SUBMISSION CREATED")
        print(f"File: {final_filename}")
        print(f"Method: {final_ensemble_method.upper()}")
        print(f"Expected Validation LogLoss: ~{best_loss:.6f}")
        print("="*80)
    else:
        print("\nERROR: Missing test predictions for one or more candidates.")
else:
    print("\nERROR: Ensemble method not determined. Run Section 2 first.")

# Final summary of generated submissions and optional performance echo
print("\n" + "="*80)
print("ALL SUBMISSION FILES GENERATED")
print("="*80)
submission_files = sorted(glob.glob("submission_*.csv"))
if submission_files:
    print(f"\nTotal submissions created: {len(submission_files)}\n")
    buckets = {
        "Step 1 (Baseline)": [f for f in submission_files if 'step1' in f],
        "Step 2 (Embeddings)": [f for f in submission_files if 'step2' in f],
        "Candidate 1 (DeBERTa + LoRA)": [f for f in submission_files if 'candidate1' in f],
        "Candidate 2 (XGBoost/LGBM)": [f for f in submission_files if 'candidate2' in f],
        "Candidate 3 (MLP + All Features)": [f for f in submission_files if 'candidate3' in f],
        "FINAL ENSEMBLE": [f for f in submission_files if 'ENSEMBLE' in f],
    }
    for k, files in buckets.items():
        if files:
            print(k)
            for f in files:
                sz = os.path.getsize(f) / (1024 * 1024)
                cal = "CALIBRATED" if "CALIBRATED" in f else ""
                print(f"   - {f:60s} ({sz:.2f} MB) {cal}")
            print()
else:
    print("\nNo submission files found. Generate predictions first.")

if 'ensemble_results' in locals() and ensemble_results:
    print("\n" + "="*80)
    print("VALIDATION PERFORMANCE SUMMARY")
    print("="*80)
    if 'individual_val_losses' in locals():
        print("\nIndividual Models (Calibrated):")
        for name, loss in sorted(individual_val_losses.items(), key=lambda x: x[1]):
            print(f"   {name:25s}: {loss:.6f}")
    print("\nEnsemble Methods:")
    for name, loss in sorted(ensemble_results.items(), key=lambda x: x[1]):
        print(f"   {name:25s}: {loss:.6f}")
    print("\n" + "="*80)
