# Team7 Assiginment2

## Step1. Setting and Dataset Load

In [None]:
import joblib, os, numpy as np, pandas as pd
import torch

LOCAL, KAGGLE = ".", "/kaggle/input/llm-classification-finetuning"
DATA = LOCAL if os.path.exists("./train.csv") else KAGGLE
train = pd.read_csv(f"{DATA}/train.csv")
test  = pd.read_csv(f"{DATA}/test.csv")
sample = pd.read_csv(f"{DATA}/sample_submission.csv")

need = {"prompt","response_a","response_b","winner_model_a","winner_model_b","winner_tie"}
assert need.issubset(set(train.columns)), f"column: {need - set(train.columns)} is missing in train.csv"
print("DATA:", DATA, train.shape, test.shape)

# target (y)
# 0: model_a win, 1: model_b win, 2: tie
y = train[["winner_model_a", "winner_model_b", "winner_tie"]].values.argmax(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
random_state = 20010815
val_size = 0.2

DATA: . (57477, 9) (3, 4)
cuda


In [3]:

### Global Functions ###
import time
import random

# Set random seeds for reproducibility
np.random.seed(random_state)
random.seed(random_state)
torch.manual_seed(random_state)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

from sentence_transformers import SentenceTransformer
def load_model(candidates, idx=0, device="cpu"):
    # Load
    last_err = None
    path = candidates[idx]
    try:
        print("try:", path)
        model = SentenceTransformer(path, device=device)
        print("loaded model from:", path)
        return model, path
    except Exception as e:
        last_err = e
    raise RuntimeError("Failed to load model. In Kaggle, you need to upload the model folder to Datasets and then link it via 'Add data'. Last error: " + str(last_err))


def build_feat(P, A, B):
    AB_diff = A - B
    AB_adiff = np.abs(AB_diff)
    AB_mul = A * B
    PA_mul = P * A
    PB_mul = P * B
    return np.hstack([P, A, B, AB_diff, AB_adiff, AB_mul, PA_mul, PB_mul])


def l2norm(a, eps=1e-12):
    n = np.linalg.norm(a, axis=1, keepdims=True)
    n = np.clip(n, eps, None)
    return a / n

def encode_texts(model, texts, batch_size=256):
    vecs = []
    total_texts = len(texts)
    total_batches = (total_texts + batch_size - 1) // batch_size

    for i in range(0, len(texts), batch_size):
        start_time = time.time()
        batch = texts[i:i+batch_size].tolist() if isinstance(texts, pd.Series) else texts[i:i+batch_size]
        v = model.encode(batch, batch_size=len(batch), convert_to_numpy=True, normalize_embeddings=False, show_progress_bar=False)
        vecs.append(v)

        batch_num = (i // batch_size) + 1
        print(f"{batch_num}/{total_batches} | time: {time.time() - start_time:.2f}s", end='\r', flush=True)
    V = np.vstack(vecs)
    return l2norm(V)


def create_and_save_submission(predictions, filename, test_df, sample_df):
    """
    Creates a Kaggle submission file from model predictions.
    Then, it normalizes the probabilities, performs validation checks, and saves the file.
    Args:
        predictions (np.array): return value of predict_proba() (N, 3)
        filename (str): csv filename to save the submission.
        test_df (pd.DataFrame): dataframe containing 'id' column.
        sample_df (pd.DataFrame): dataframe to align columns with sample submission.
    """
    print(f"Creating submission file: {filename}...")
    
    # 1. Save Submission File
    sub_df = pd.DataFrame({
        "id": test_df["id"],
        "winner_model_a": predictions[:, 0],
        "winner_model_b": predictions[:, 1],
        "winner_tie":     predictions[:, 2],
    })

    # 2. Normalization check (safety)
    probs = sub_df[["winner_model_a", "winner_model_b", "winner_tie"]].values
    row_sums = probs.sum(axis=1, keepdims=True)
    probs = probs / np.clip(row_sums, 1e-15, None)
    sub_df[["winner_model_a", "winner_model_b", "winner_tie"]] = probs

    # 3. Align columns with sample submission
    try:
        sub_df = sub_df[sample_df.columns]
    except KeyError as e:
        print(f"Warning: Columns in sample_df not found. Saving with default columns. Error: {e}")

    # 4. Save
    sub_df.to_csv(filename, index=False)

    # 5. Assertions to check file integrity
    try:
        chk = pd.read_csv(filename)
        
        assert list(chk.columns) == list(sample_df.columns), \
            f"Column mismatch. Expected: {list(sample_df.columns)}, Got: {list(chk.columns)}"
        
        assert not chk.isna().any().any(), "NaN values found in submission file."
        
        prob_cols = ["winner_model_a", "winner_model_b", "winner_tie"]
        assert np.allclose(chk[prob_cols].sum(1).values, 1.0), \
            "Probabilities do not sum to 1.0 for all rows."
            
        print(f"Successfully saved and verified: {filename} (Shape: {sub_df.shape})")
        
    except FileNotFoundError:
        print(f"Error: File not found after saving: {filename}")
    except AssertionError as e:
        print(f"Error: Submission file verification failed! {e}")
    
    return sub_df

print("All functions loaded.")


  from .autonotebook import tqdm as notebook_tqdm


All functions loaded.


### Step 2. Model Download

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os

MODELS = {
    "deberta-v3-base": "microsoft/deberta-v3-base",
    "e5-base-v2":      "intfloat/e5-base-v2",
}

BASE_DIR = "./models"
os.makedirs(BASE_DIR, exist_ok=True)

for name, hub_path in MODELS.items():
    save_path = os.path.join(BASE_DIR, name)
    if os.path.exists(save_path) and os.listdir(save_path):
        print(f"[skip] {name} already exists → {save_path}")
        continue

    print(f"[download] {name} from {hub_path}")
    try:
        # 1) SentenceTransformers 
        st_model = SentenceTransformer(hub_path)
        st_model.save(save_path)
        print(f" -> saved (sentence-transformers) to {save_path}")
        continue
    except Exception as e1:
        # 2) Hugging Face transformers
        try:
            tokenizer = AutoTokenizer.from_pretrained(hub_path)
            model = AutoModelForSequenceClassification.from_pretrained(hub_path)
            tokenizer.save_pretrained(save_path)
            model.save_pretrained(save_path)
            print(f" -> saved (transformers) to {save_path}")
        except Exception as e2:
            print(f"[fail] {name}: {e2}")

print("=== Model Download Complete (existing ones skipped) ===")

[skip] all-MiniLM-L6-v2 already exists → ./models/all-MiniLM-L6-v2
[skip] all-MiniLM-L12-v2 already exists → ./models/all-MiniLM-L12-v2
[skip] e5-small-v2 already exists → ./models/e5-small-v2
[skip] e5-base-v2 already exists → ./models/e5-base-v2
[skip] e5-large-v2 already exists → ./models/e5-large-v2
[skip] sentence-t5-base already exists → ./models/sentence-t5-base
[skip] sentence-t5-large already exists → ./models/sentence-t5-large
=== Model Download Complete (existing ones skipped) ===


### Candidate 1: DeBERTa + LoRA ###

In [None]:
### Candidate 1: DeBERTa + LoRA ###
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from scipy.special import softmax

# MODEL_NAME = f"microsoft/deberta-v3-base"
MODEL_NAME = f"deberta-v3-base"
max_length = 512 # Max length(tokens) for DeBERTa

LORA_ADAPTER_DIR = f"./models/lora_adapter_{MODEL_NAME.split('/')[-1]}"

print(f"Using model: {MODEL_NAME}")
print(f"LoRA adapter will be saved to: {LORA_ADAPTER_DIR}")

train_df_lora = train.copy()
train_df_lora['labels'] = y

train_df, val_df = train_test_split(
    train_df_lora,
    test_size=val_size,

    stratify=train_df_lora['labels']
)
print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

# Convert to Hugging Face 'Dataset' object
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load the tokenizer for our model
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
path = f"./models/{MODEL_NAME}"
tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=True)
# model = AutoModelForSequenceClassification.from_pretrained(path, local_files_only=True)

def preprocess_function(examples):
    # This formats the input as: [CLS] prompt [SEP] A: response_a [SEP] B: response_b [SEP]
    # This is a robust way to present the three pieces of text to the model
    
    # Combine response_a and response_b into a single string
    response_pair = [f"A: {a} {tokenizer.sep_token} B: {b}" for a, b in zip(examples['response_a'], examples['response_b'])]
    
    # Tokenize, using prompt as the first sequence and the combined response as the second
    tokenized_inputs = tokenizer(
        examples['prompt'],
        response_pair, # This will be the second sequence
        max_length=max_length,
        truncation=True, # Need to consider whitch option is better
        padding=False # DataCollator will handle dynamic padding
    )
    
    # Add labels
    tokenized_inputs["labels"] = examples["labels"]
    return tokenized_inputs

print("Tokenizing datasets...")
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_df.columns.tolist())
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_df.columns.tolist())

# Data collator will dynamically pad batches to the max length *in that batch*
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Data preparation complete.")


Using model: microsoft/deberta-v3-base
LoRA adapter will be saved to: ./models/lora_adapter_deberta-v3-base
Training samples: 45981, Validation samples: 11496




Tokenizing datasets...


Map: 100%|██████████| 45981/45981 [00:23<00:00, 1936.88 examples/s]
Map: 100%|██████████| 11496/11496 [00:07<00:00, 1638.31 examples/s]

Data preparation complete.





In [None]:
# Fine-tuning with LoRA (NO Quantization)
# FINAL SOLUTION: Skip quantization entirely - it has compatibility issues with DeBERTa-v3
# Use gradient checkpointing for memory efficiency

# Check if LoRA adapter already exists
if os.path.exists(LORA_ADAPTER_DIR) and os.path.exists(os.path.join(LORA_ADAPTER_DIR, "adapter_config.json")):
    print(f"LoRA adapter already exists at: {LORA_ADAPTER_DIR}")
    print("Skipping training and loading existing adapter...")
    
    # Load base model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        local_files_only=True
    )
    
    # Load the saved LoRA adapter
    from peft import PeftModel
    peft_model = PeftModel.from_pretrained(model, LORA_ADAPTER_DIR)
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(LORA_ADAPTER_DIR)
    
    # Create a minimal trainer for prediction only
    training_args = TrainingArguments(
        output_dir=f"./{MODEL_NAME.split('/')[-1]}-checkpoints",
        per_device_eval_batch_size=8,
        fp16=False,
        report_to="none",
    )
    
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    print("Successfully loaded existing LoRA adapter.")
else:
    print("No existing LoRA adapter found. Starting training from scratch...")
    
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        local_files_only=True
    )

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=[
            "query_proj", 
            "key_proj", 
            "value_proj",
            "dense"
        ],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        inference_mode=False
    )

    # Apply LoRA to the model
    peft_model = get_peft_model(model, lora_config)
    peft_model.print_trainable_parameters()

    # --- Define Custom Compute Metrics ---
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        probs = softmax(logits, axis=1)
        
        eps = 1e-15
        probs = np.clip(probs, eps, 1 - eps)
        
        loss = log_loss(labels, probs)
        return {"log_loss": loss}

    # --- Define Training Arguments ---
    training_args = TrainingArguments(
        output_dir=f"./{MODEL_NAME.split('/')[-1]}-checkpoints",
        num_train_epochs=2,
        per_device_train_batch_size=4,  # Reduced for memory
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=2,  # Effective batch size = 4*2 = 8
        learning_rate=2e-5,
        weight_decay=0.01,

        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,

        load_best_model_at_end=True,
        metric_for_best_model="log_loss",
        greater_is_better=False,

        logging_steps=100,
        fp16=False,  # Disable fp16 due to gradient checkpointing conflict
        report_to="none",
        gradient_checkpointing=True,  # Enable gradient checkpointing
    )

    # --- Initialize Trainer ---
    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # --- Start Training ---
    print("Starting fine-tuning...")
    trainer.train()

    # --- Save the final LoRA adapter ---
    # This saves only the small, trainable adapter weights
    os.makedirs(LORA_ADAPTER_DIR, exist_ok=True)
    trainer.model.save_pretrained(LORA_ADAPTER_DIR)

    # Also save the tokenizer
    tokenizer.save_pretrained(LORA_ADAPTER_DIR)

    print(f"Training complete. LoRA adapter saved to: {LORA_ADAPTER_DIR}")

# --- Predict on Test Data and Create Kaggle Submission ---
print("\n=== Generating Kaggle Submission ===")

# Make predictions on the tokenized test dataset
def preprocess_test_function(examples):
    # This formats the input as: [CLS] prompt [SEP] A: response_a [SEP] B: response_b [SEP]
    
    # Combine response_a and response_b into a single string
    response_pair = [f"A: {a} {tokenizer.sep_token} B: {b}" for a, b in zip(examples['response_a'], examples['response_b'])]
    
    # Tokenize, using prompt as the first sequence and the combined response as the second
    tokenized_inputs = tokenizer(
        examples['prompt'],
        response_pair, # This will be the second sequence
        max_length=max_length,
        truncation=True, # Need to consider whitch option is better
        padding=False # DataCollator will handle dynamic padding
    )
    return tokenized_inputs
print("Tokenizing test dataset...")
test_dataset = Dataset.from_pandas(test)
tokenized_test_dataset = test_dataset.map(preprocess_test_function, batched=True, remove_columns=test.columns.tolist())

print("Test data tokenization complete.")


LoRA adapter already exists at: ./models/lora_adapter_deberta-v3-base
Skipping training and loading existing adapter...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Successfully loaded existing LoRA adapter.

=== Generating Kaggle Submission ===
Tokenizing test dataset...


Map: 100%|██████████| 3/3 [00:00<00:00, 280.59 examples/s]


Test data tokenization complete.


Creating submission file: submission_candidate1_deberta_lora_deberta-v3-base.csv...
Successfully saved and verified: submission_candidate1_deberta_lora_deberta-v3-base.csv (Shape: (3, 4))
Kaggle submission file created successfully!


### Candidate 1: Calibration

In [None]:
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import log_loss
import numpy as np

print("\n=== Candidate 1 (DeBERTa + LoRA): Calibration ===")

# Get validation predictions from the existing trainer
val_predictions = trainer.predict(tokenized_val_dataset)
val_logits = val_predictions.predictions
val_probs_before = softmax(val_logits, axis=1)

# Get true labels from val_df
y_val_c1 = val_df['labels'].values

# Calculate log loss BEFORE calibration
logloss_before = log_loss(y_val_c1, val_probs_before)
print(f"Validation LogLoss BEFORE Calibration: {logloss_before:.6f}")

# Apply Isotonic Calibration per class
# We'll calibrate each class probability separately
print("Applying Isotonic calibration...")

calibrators = []
val_probs_calibrated = np.zeros_like(val_probs_before)

for class_idx in range(3):
    # Get probabilities for this class
    class_probs = val_probs_before[:, class_idx]
    
    # Create binary labels (1 if true class, 0 otherwise)
    y_binary = (y_val_c1 == class_idx).astype(int)
    
    # Fit isotonic regression
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(class_probs, y_binary)
    
    # Calibrate
    val_probs_calibrated[:, class_idx] = iso.predict(class_probs)
    
    calibrators.append(iso)

# Normalize probabilities to sum to 1
row_sums = val_probs_calibrated.sum(axis=1, keepdims=True)
val_probs_calibrated = val_probs_calibrated / np.clip(row_sums, 1e-15, None)

# Calculate log loss AFTER calibration
logloss_after = log_loss(y_val_c1, val_probs_calibrated)
print(f"Validation LogLoss AFTER Calibration: {logloss_after:.6f}")
print(f"Improvement: {logloss_before - logloss_after:.6f}")

# Now predict on test set with calibration
print("\nGenerating calibrated predictions for test set...")

# Get test predictions
test_predictions = trainer.predict(tokenized_test_dataset)
test_logits = test_predictions.predictions
test_probs_uncalibrated = softmax(test_logits, axis=1)

# Apply calibration
test_probs_calibrated = np.zeros_like(test_probs_uncalibrated)
for class_idx in range(3):
    class_probs = test_probs_uncalibrated[:, class_idx]
    test_probs_calibrated[:, class_idx] = calibrators[class_idx].predict(class_probs)

joblib.dump(calibrators, './models/candidate_1_calibrators.pkl')
print("Candidate 1 Calibrators SAVED to ./models/candidate_1_calibrators.pkl")


=== Candidate 1 (DeBERTa + LoRA): Calibration ===


Validation LogLoss BEFORE Calibration: 1.098774
Applying Isotonic calibration...
Validation LogLoss AFTER Calibration: 1.073453
Improvement: 0.025321

Generating calibrated predictions for test set...


Creating submission file: submission_candidate1_deberta_lora_CALIBRATED.csv...
Successfully saved and verified: submission_candidate1_deberta_lora_CALIBRATED.csv (Shape: (3, 4))

=== Candidate 1 Summary ===
Before Calibration - Val LogLoss: 1.098774
After Calibration  - Val LogLoss: 1.073453
Final submission saved with calibration.


### Candidate 2: PLM + LightGBM(XGBoost)

In [17]:
### Candidate 2: PLM + LightGBM(XGBoost) ###
import lightgbm as lgb
import xgboost as xgb
import joblib # For saving models
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import time
import os
import torch

# Choose the model to run: "LGBM" or "XGBOOST"

MODEL_NAME = "e5-base-v2" 
BEST_EMBEDDING_MODEL_PATH = f"./models/{MODEL_NAME}"
print(f"Using embedding model: {BEST_EMBEDDING_MODEL_PATH}")

# Load the chosen embedding model
try:
    # We pass a list containing only our chosen model path
    sbert_model, model_src = load_model([BEST_EMBEDDING_MODEL_PATH], idx=0, device=device)
    # print(f"Successfully loaded model from: {model_src}")
except Exception as e:
    print(f"Failed to load model from {BEST_EMBEDDING_MODEL_PATH}. Error: {e}")

print("Encoding texts")
start_time = time.time()

# Encode training data
prompt_emb = encode_texts(sbert_model, train["prompt"])
print("Prompt encoding complete.")
a_emb = encode_texts(sbert_model, train["response_a"])
print("Response A encoding complete.")
b_emb = encode_texts(sbert_model, train["response_b"])
print("Response B encoding complete.")

# Build training features
X_c2 = build_feat(prompt_emb, a_emb, b_emb) # X for candidate 2

# Encode test data
print("Encoding test data...")
prompt_emb_te = encode_texts(sbert_model, test["prompt"])
a_emb_te = encode_texts(sbert_model, test["response_a"])
b_emb_te = encode_texts(sbert_model, test["response_b"])
print("Test encoding complete.")

# Build test features
X_test_c2 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)

# Clean up model from memory
del sbert_model, prompt_emb, a_emb, b_emb, prompt_emb_te, a_emb_te, b_emb_te

print(f"Feature extraction complete. Time taken: {time.time() - start_time:.2f}s")
print(f"Train features shape (X_c2): {X_c2.shape}")
print(f"Test features shape (X_test_c2): {X_test_c2.shape}")

# Create Train/Validation Split
X_tr, X_va, y_tr, y_va = train_test_split(X_c2, y, test_size=val_size, stratify=y, random_state=random_state)
print(f"Data split into: Train {X_tr.shape}, Validation {X_va.shape}")


Using embedding model: ./models/e5-base-v2
try: ./models/e5-base-v2
loaded model from: ./models/e5-base-v2
Encoding texts
Prompt encoding complete.
Response A encoding complete.
Response B encoding complete.
Encoding test data...
Test encoding complete.
Feature extraction complete. Time taken: 1465.02s
Train features shape (X_c2): (57477, 6144)
Test features shape (X_test_c2): (3, 6144)
Data split into: Train (45981, 6144), Validation (11496, 6144)


In [18]:
# Path to save the trained GBM model
GBM_CHOICE = "XGBOOST" 
CANDIDATE_2_MODEL_SAVE_PATH = f"./models/candidate_2_{GBM_CHOICE.lower()}_{BEST_EMBEDDING_MODEL_PATH.split('/')[-1]}.pkl"
print(f"--- Candidate 2: {GBM_CHOICE} ---")
print(f"Model will be saved to: {CANDIDATE_2_MODEL_SAVE_PATH}")

start_time = time.time()

# This dictionary will hold the best validation logloss
val_logloss = {}
clf_c2 = None

if GBM_CHOICE == "LGBM":
    # --- 2.1. LightGBM ---
    clf_c2 = lgb.LGBMClassifier(
        objective='multiclass',
        metric='multi_logloss',
        num_class=3,
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=random_state,
        # device='gpu' if device == 'cuda' else 'cpu'
        device=device
    )
    
    clf_c2.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(100, verbose=True)]
    )
    
    va_pred = clf_c2.predict_proba(X_va)
    val_logloss['LGBM'] = log_loss(y_va, va_pred)
    print(f"--- LGBM Validation LogLoss: {val_logloss['LGBM']:.6f} ---")

elif GBM_CHOICE == "XGBOOST":
    # --- 2.2. XGBoost ---
    clf_c2 = xgb.XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=3,
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=random_state,
        device=device,
        early_stopping_rounds=100
    )
    
    clf_c2.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=True
    )
    
    va_pred = clf_c2.predict_proba(X_va)
    val_logloss['XGBOOST'] = log_loss(y_va, va_pred)
    print(f"--- XGBOOST Validation LogLoss: {val_logloss['XGBOOST']:.6f} ---")

else:
    print(f"Error: Unknown GBM_CHOICE '{GBM_CHOICE}'. Please set to 'LGBM' or 'XGBOOST'.")

print(f"Training complete. Time taken: {time.time() - start_time:.2f}s")

if clf_c2 is not None:
    # Use joblib for cross-compatibility between LGBM/XGB
    joblib.dump(clf_c2, CANDIDATE_2_MODEL_SAVE_PATH)
    print(f"Model saved to: {CANDIDATE_2_MODEL_SAVE_PATH}")

--- Candidate 2: XGBOOST ---
Model will be saved to: ./models/candidate_2_xgboost_e5-base-v2.pkl
[0]	validation_0-mlogloss:1.09635
[1]	validation_0-mlogloss:1.09427
[2]	validation_0-mlogloss:1.09219
[3]	validation_0-mlogloss:1.09043
[4]	validation_0-mlogloss:1.08872
[5]	validation_0-mlogloss:1.08706
[6]	validation_0-mlogloss:1.08541
[7]	validation_0-mlogloss:1.08398
[8]	validation_0-mlogloss:1.08247
[9]	validation_0-mlogloss:1.08107
[10]	validation_0-mlogloss:1.07967
[11]	validation_0-mlogloss:1.07837
[12]	validation_0-mlogloss:1.07707
[13]	validation_0-mlogloss:1.07584
[14]	validation_0-mlogloss:1.07469
[15]	validation_0-mlogloss:1.07350
[16]	validation_0-mlogloss:1.07261
[17]	validation_0-mlogloss:1.07167
[18]	validation_0-mlogloss:1.07075
[19]	validation_0-mlogloss:1.06978
[20]	validation_0-mlogloss:1.06879
[21]	validation_0-mlogloss:1.06788
[22]	validation_0-mlogloss:1.06699
[23]	validation_0-mlogloss:1.06616
[24]	validation_0-mlogloss:1.06532
[25]	validation_0-mlogloss:1.06454
[26

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


--- XGBOOST Validation LogLoss: 1.035757 ---
Training complete. Time taken: 120.73s
Model saved to: ./models/candidate_2_xgboost_e5-base-v2.pkl


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
import numpy as np

print(f"\n=== Candidate 2 ({GBM_CHOICE}): Calibration ===")

# Get validation predictions BEFORE calibration
# We need to recreate the train/val split from earlier
X_tr_c2, X_va_c2, y_tr_c2, y_va_c2 = train_test_split(
    X_c2, y, test_size=val_size, stratify=y, random_state=random_state
)

# Get predictions from the validation model (clf_c2 trained on X_tr, X_va split)
if 'clf_c2' in locals() and clf_c2 is not None:
    va_pred_before = clf_c2.predict_proba(X_va_c2)
    logloss_before = log_loss(y_va_c2, va_pred_before)
    print(f"Validation LogLoss BEFORE Calibration: {logloss_before:.6f}")
    
    # Apply calibration (using isotonic regression)
    print("Applying Isotonic calibration...")
    calibrated_model_c2 = CalibratedClassifierCV(
        clf_c2,
        method='isotonic',  # isotonic or sigmoid
        cv='prefit',  # Model is already fitted
        ensemble=False
    )
    
    # Fit calibration on validation set
    calibrated_model_c2.fit(X_va_c2, y_va_c2)
    
    # Get calibrated predictions on validation set
    va_pred_after = calibrated_model_c2.predict_proba(X_va_c2)
    logloss_after = log_loss(y_va_c2, va_pred_after)
    print(f"Validation LogLoss AFTER Calibration: {logloss_after:.6f}")
    print(f"Improvement: {logloss_before - logloss_after:.6f}")
    
    # Now retrain on full data and apply calibration
    print("\nRetraining on full data for final submission...")
    
    # Get best iteration
    try:
        best_iter = clf_c2.best_iteration_ or 1000
    except AttributeError:
        best_iter = 1000
    
    X_train_full, X_cal, y_train_full, y_cal = train_test_split(
        X_c2, y, test_size=val_size, stratify=y, random_state=random_state
    )
    
    if GBM_CHOICE == "LGBM":
        clf_c2_for_calib = lgb.LGBMClassifier(
            objective='multiclass', metric='multi_logloss', num_class=3,
            n_estimators=best_iter, learning_rate=0.05,
            n_jobs=-1, random_state=random_state, device=device
        )
    elif GBM_CHOICE == "XGBOOST":
        clf_c2_for_calib = xgb.XGBClassifier(
            objective='multi:softprob', eval_metric='mlogloss', num_class=3,
            n_estimators=best_iter, learning_rate=0.05,
            n_jobs=-1, random_state=random_state, device=device
        )
    
    clf_c2_for_calib.fit(X_train_full, y_train_full)
    
    # Apply calibration on the hold-out 20%
    calibrated_final_c2 = CalibratedClassifierCV(
        clf_c2_for_calib,
        method='isotonic',
        cv='prefit',
        ensemble=False
    )
    calibrated_final_c2.fit(X_cal, y_cal)
    
    C2_CALIBRATED_MODEL_PATH = f"./models/candidate_2_{GBM_CHOICE}_{MODEL_NAME}_CALIBRATED.pkl"
    joblib.dump(calibrated_final_c2, C2_CALIBRATED_MODEL_PATH)
    print(f"Candidate 2 Calibrated Model SAVED to {C2_CALIBRATED_MODEL_PATH}")
    
    print(f"\n=== Candidate 2 Summary ===")
    print(f"Before Calibration - Val LogLoss: {logloss_before:.6f}")
    print(f"After Calibration  - Val LogLoss: {logloss_after:.6f}")
    print(f"Final submission saved with calibration.")
else:
    print("Error: clf_c2 model not found. Please run the training cell first.")

### Candidate 3: All Features + MLP

In [6]:
### Candidate 3: All Features + MLP ###
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import joblib
import os

# Define the "Strong" lexical feature builder
def stats_strong(s):
    """Calculates a comprehensive set of lexical statistics."""
    if not isinstance(s, str): s = ""
    toks = s.split()
    return {
        "len_char": len(s),
        "len_tok": len(toks),
        "num_sent": sum(s.count(x) for x in [".", "!", "?"]),
        "num_code": s.count("`"),
        "num_list": s.count("- ") + s.count("* "),
        "num_upper": sum(ch.isupper() for ch in s),
        "num_punct": sum(ch in ",;:()" for ch in s),
        "avg_tok_len": (sum(len(t) for t in toks) / len(toks)) if toks else 0.0,
    }

def build_strong_lexical_features(df):
    """Builds the full set of lexical and bias features."""
    rows = []
    cols = ["prompt", "response_a", "response_b"]
    
    for p, a, b in zip(df[cols[0]], df[cols[1]], df[cols[2]]):
        ps, as_, bs = stats_strong(p), stats_strong(a), stats_strong(b)
        rows.append({
            "p_len_char": ps["len_char"], "p_len_tok": ps["len_tok"], "p_num_sent": ps["num_sent"],
            "a_len_char": as_["len_char"], "a_len_tok": as_["len_tok"], "a_num_sent": as_["num_sent"],
            "a_num_code": as_["num_code"], "a_num_list": as_["num_list"], "a_num_upper": as_["num_upper"],
            "a_num_punct": as_["num_punct"], "a_avg_tok_len": as_["avg_tok_len"],
            "b_len_char": bs["len_char"], "b_len_tok": bs["len_tok"], "b_num_sent": bs["num_sent"],
            "b_num_code": bs["num_code"], "b_num_list": bs["num_list"], "b_num_upper": bs["num_upper"],
            "b_num_punct": bs["num_punct"], "b_avg_tok_len": bs["avg_tok_len"],
            # A-B Differences
            "d_len_char": as_["len_char"] - bs["len_char"],
            "d_len_tok": as_["len_tok"] - bs["len_tok"],
            "d_num_sent": as_["num_sent"] - bs["num_sent"],
            "d_num_code": as_["num_code"] - bs["num_code"],
            "d_num_list": as_["num_list"] - bs["num_list"],
            "d_num_upper": as_["num_upper"] - bs["num_upper"],
            "d_num_punct": as_["num_punct"] - bs["num_punct"],
            "d_avg_tok_len": as_["avg_tok_len"] - bs["avg_tok_len"],
            # Ratios
            "r_len_char": (as_["len_char"] + 1) / (bs["len_char"] + 1),
            "r_len_tok": (as_["len_tok"] + 1) / (bs["len_tok"] + 1),
            "r_num_sent": (as_["num_sent"] + 1) / (bs["num_sent"] + 1),
        })
    return pd.DataFrame(rows)

print("--- Candidate 3: All Features + MLP ---")
print("Building strong lexical features...")

# Generate Lexical Features
X_lex_strong = build_strong_lexical_features(train)
X_test_lex_strong = build_strong_lexical_features(test)

print(f"Strong lexical features shape: {X_lex_strong.shape}")

--- Candidate 3: All Features + MLP ---
Building strong lexical features...
Strong lexical features shape: (57477, 30)


In [7]:
import joblib # For saving models
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import time
import os
import torch

MODEL_NAME = "e5-base-v2" 
BEST_EMBEDDING_MODEL_PATH = f"./models/{MODEL_NAME}"
print(f"Using embedding model: {BEST_EMBEDDING_MODEL_PATH}")

# Load the chosen embedding model
try:
    # We pass a list containing only our chosen model path
    sbert_model, model_src = load_model([BEST_EMBEDDING_MODEL_PATH], idx=0, device=device)
    # print(f"Successfully loaded model from: {model_src}")
except Exception as e:
    print(f"Failed to load model from {BEST_EMBEDDING_MODEL_PATH}. Error: {e}")

print("Encoding texts")
start_time = time.time()

# Encode training data
prompt_emb = encode_texts(sbert_model, train["prompt"])
print("Prompt encoding complete.")
a_emb = encode_texts(sbert_model, train["response_a"])
print("Response A encoding complete.")
b_emb = encode_texts(sbert_model, train["response_b"])
print("Response B encoding complete.")

# Build training features
X_c3 = build_feat(prompt_emb, a_emb, b_emb) # X for candidate 3

# Encode test data
print("Encoding test data...")
prompt_emb_te = encode_texts(sbert_model, test["prompt"])
a_emb_te = encode_texts(sbert_model, test["response_a"])
b_emb_te = encode_texts(sbert_model, test["response_b"])
print("Encoding complete for test data.")

# Build test features
X_test_c3 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)

# Clean up model from memory
del sbert_model, prompt_emb, a_emb, b_emb, prompt_emb_te, a_emb_te, b_emb_te

print(f"Feature extraction complete. Time taken: {time.time() - start_time:.2f}s")
print(f"Train features shape (X_c3): {X_c3.shape}")
print(f"Test features shape (X_test_c3): {X_test_c3.shape}")

# Create Train/Validation Split
X_tr, X_va, y_tr, y_va = train_test_split(X_c3, y, test_size=val_size, stratify=y)
print(f"Data split into: Train {X_tr.shape}, Validation {X_va.shape}")

Using embedding model: ./models/e5-base-v2
try: ./models/e5-base-v2
loaded model from: ./models/e5-base-v2
Encoding texts
Prompt encoding complete.
Response A encoding complete.
Response B encoding complete.
Encoding test data...
Encoding complete for test data.
Feature extraction complete. Time taken: 1461.08s
Train features shape (X_c3): (57477, 6144)
Test features shape (X_test_c3): (3, 6144)
Data split into: Train (45981, 6144), Validation (11496, 6144)


In [8]:

# Handle NaNs/Infs for safety
X_lex_strong = X_lex_strong.fillna(0).replace([np.inf, -np.inf], 0)
X_test_lex_strong = X_test_lex_strong.fillna(0).replace([np.inf, -np.inf], 0)

# Convert embedding features (numpy) to DataFrame for safe processing
X_c3_safe = pd.DataFrame(X_c3).fillna(0).replace([np.inf, -np.inf], 0).values
X_test_c3_safe = pd.DataFrame(X_test_c3).fillna(0).replace([np.inf, -np.inf], 0).values

print("Combining lexical and embedding features...")
X_c3 = np.hstack([X_lex_strong, X_c3_safe])
X_test_c3 = np.hstack([X_test_lex_strong, X_test_c3_safe])
print(f"Combined train features shape (X_c3): {X_c3.shape}")
print(f"Combined test features shape (X_test_c3): {X_test_c3.shape}")

X_tr, X_va, y_tr, y_va = train_test_split(X_c3, y, test_size=val_size, stratify=y, random_state=random_state)

print("Scaling features...")
scaler_c3 = StandardScaler()
X_tr_sc = scaler_c3.fit_transform(X_tr)
X_va_sc = scaler_c3.transform(X_va)

# Train MLP Classifier
print("Training MLPClassifier...")
start_time = time.time()

clf_c3 = MLPClassifier(
    hidden_layer_sizes=(512, 256),
    activation="relu",
    solver="adam",
    alpha=1e-4,          # L2 regularization
    batch_size=512,
    learning_rate_init=1e-3,
    max_iter=100,        
    early_stopping=True,
    n_iter_no_change=5,
    random_state=random_state,
    verbose=True
)

clf_c3.fit(X_tr_sc, y_tr)

va_pred = clf_c3.predict_proba(X_va_sc)
val_logloss = log_loss(y_va, va_pred)

print(f"Training complete. Time taken: {time.time() - start_time:.2f}s")
print(f"--- MLP (All Features) Validation LogLoss: {val_logloss:.6f} ---")

CANDIDATE_3_MODEL_SAVE_PATH = "./models/candidate_3_mlp.pkl"
CANDIDATE_3_SCALER_SAVE_PATH = "./models/candidate_3_scaler.pkl"

joblib.dump(clf_c3, CANDIDATE_3_MODEL_SAVE_PATH)
joblib.dump(scaler_c3, CANDIDATE_3_SCALER_SAVE_PATH)

print(f"MLP model saved to: {CANDIDATE_3_MODEL_SAVE_PATH}")
print(f"Scaler saved to: {CANDIDATE_3_SCALER_SAVE_PATH}")


Combining lexical and embedding features...
Combined train features shape (X_c3): (57477, 6174)
Combined test features shape (X_test_c3): (3, 6174)
Scaling features...
Training MLPClassifier...
Iteration 1, loss = 1.12600698
Validation score: 0.463796
Iteration 2, loss = 0.82086238
Validation score: 0.447271
Iteration 3, loss = 0.53996426
Validation score: 0.430093
Iteration 4, loss = 0.26805010
Validation score: 0.429441
Iteration 5, loss = 0.12704382
Validation score: 0.448141
Iteration 6, loss = 0.06509657
Validation score: 0.437486
Iteration 7, loss = 0.04185474
Validation score: 0.436834
Validation score did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Training complete. Time taken: 56.44s
--- MLP (All Features) Validation LogLoss: 1.048909 ---
MLP model saved to: ./models/candidate_3_mlp.pkl
Scaler saved to: ./models/candidate_3_scaler.pkl


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
import numpy as np

print("\n=== Candidate 3 (MLP All Features): Calibration ===")

# Get validation predictions BEFORE calibration
# We need to recreate the train/val split from earlier
X_tr_c3, X_va_c3, y_tr_c3, y_va_c3 = train_test_split(
    X_c3, y, test_size=val_size, stratify=y, random_state=random_state
)

# Scale validation data
scaler_c3_val = StandardScaler()
X_tr_sc_c3 = scaler_c3_val.fit_transform(X_tr_c3)
X_va_sc_c3 = scaler_c3_val.transform(X_va_c3)

# Get predictions from the validation model (clf_c3 trained on X_tr, X_va split)
if 'clf_c3' in locals() and clf_c3 is not None:
    va_pred_before = clf_c3.predict_proba(X_va_sc_c3)
    logloss_before = log_loss(y_va_c3, va_pred_before)
    print(f"Validation LogLoss BEFORE Calibration: {logloss_before:.6f}")
    
    # Apply calibration (using isotonic regression)
    print("Applying Isotonic calibration...")
    calibrated_model_c3 = CalibratedClassifierCV(
        clf_c3,
        method='isotonic',  # isotonic or sigmoid
        cv='prefit',  # Model is already fitted
        ensemble=False
    )
    
    # Fit calibration on validation set
    calibrated_model_c3.fit(X_va_sc_c3, y_va_c3)
    
    # Get calibrated predictions on validation set
    va_pred_after = calibrated_model_c3.predict_proba(X_va_sc_c3)
    logloss_after = log_loss(y_va_c3, va_pred_after)
    print(f"Validation LogLoss AFTER Calibration: {logloss_after:.6f}")
    print(f"Improvement: {logloss_before - logloss_after:.6f}")
    
    # Now retrain on full data and apply calibration
    print("\nRetraining on full data for final submission...")
    0
    X_train_full, X_cal, y_train_full, y_cal = train_test_split(
        X_c3, y, test_size=val_size, stratify=y, random_state=random_state
    )
    
    # Scale
    scaler_final_c3 = StandardScaler()
    X_train_full_sc = scaler_final_c3.fit_transform(X_train_full)
    X_cal_sc = scaler_final_c3.transform(X_cal)
    X_test_final_sc = scaler_final_c3.transform(X_test_c3)
    
    # Train MLP
    clf_c3_for_calib = MLPClassifier(
        hidden_layer_sizes=(512, 256), activation="relu", solver="adam",
        alpha=1e-4, batch_size=512, learning_rate_init=1e-3,
        max_iter=100, early_stopping=True, n_iter_no_change=5,
        random_state=random_state, verbose=False
    )
    clf_c3_for_calib.fit(X_train_full_sc, y_train_full)
    
    # Apply calibration on the hold-out 20%
    calibrated_final_c3 = CalibratedClassifierCV(
        clf_c3_for_calib,
        method='isotonic',
        cv='prefit',
        ensemble=False
    )
    calibrated_final_c3.fit(X_cal_sc, y_cal)
    
    C3_CALIBRATED_MODEL_PATH = "./models/candidate_3_MLP_CALIBRATED.pkl"
    C3_FINAL_SCALER_PATH = "./models/candidate_3_scaler_final.pkl"
    joblib.dump(calibrated_final_c3, C3_CALIBRATED_MODEL_PATH)
    joblib.dump(scaler_final_c3, C3_FINAL_SCALER_PATH)
    print(f"Candidate 3 Calibrated Model SAVED to {C3_CALIBRATED_MODEL_PATH}")
    print(f"Candidate 3 Final Scaler SAVED to {C3_FINAL_SCALER_PATH}")
    
    print(f"\n=== Candidate 3 Summary ===")
    print(f"Before Calibration - Val LogLoss: {logloss_before:.6f}")
    print(f"After Calibration  - Val LogLoss: {logloss_after:.6f}")
    print(f"Final submission saved with calibration.")
else:
    print("Error: clf_c3 model not found. Please run the training cell first.")


=== Candidate 3 (MLP All Features): Calibration ===
Validation LogLoss BEFORE Calibration: 1.048909
Applying Isotonic calibration...




Validation LogLoss AFTER Calibration: 1.036883
Improvement: 0.012026

Retraining on full data for final submission...
Creating submission file: submission_candidate3_MLP_AllFeatures_CALIBRATED.csv...
Successfully saved and verified: submission_candidate3_MLP_AllFeatures_CALIBRATED.csv (Shape: (3, 4))

=== Candidate 3 Summary ===
Before Calibration - Val LogLoss: 1.048909
After Calibration  - Val LogLoss: 1.036883
Final submission saved with calibration.


