# Team7 Assignment 2: Kaggle Inference Notebook (LLM Classification Finetuning)

### Setup and Dataset Loading

In [1]:
import os

# Pin this process to a single GPU *before* torch initialises CUDA.
# - setdefault() keeps any value already chosen by the launcher/scheduler
#   instead of overwriting it.
# - Device ordinal 3 only exists on hosts with at least four GPUs; on smaller
#   hosts the pin would hide every GPU and the notebook would silently run on
#   CPU, so it is skipped inside Kaggle kernels (which set
#   KAGGLE_KERNEL_RUN_TYPE).
if "KAGGLE_KERNEL_RUN_TYPE" not in os.environ:
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

In [2]:
import os, numpy as np, pandas as pd
import torch
import joblib, time
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from scipy.optimize import minimize
from scipy.special import softmax
from peft import PeftModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from sklearn.calibration import CalibratedClassifierCV
import glob

# =========================================================================
# Input locations: point these at the mounted Kaggle model / data folders
# =========================================================================
KAGGLE_MODEL_DIR = "../models"
KAGGLE_DATA_DIR = "../datasets"
BASE_DIR = KAGGLE_MODEL_DIR

# Read the three competition CSVs (train, test, sample submission) in order.
train, test, sample = (
    pd.read_csv(f"{KAGGLE_DATA_DIR}/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

print("DATA:", KAGGLE_DATA_DIR, train.shape, test.shape)

# Integer class per training row, taken as the argmax over the three
# one-hot winner columns: 0 = model_a wins, 1 = model_b wins, 2 = tie.
y = train[["winner_model_a", "winner_model_b", "winner_tie"]].to_numpy().argmax(axis=1)

# Run-wide configuration shared by the cells below.
random_state = 20010815
val_size = 0.2
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


DATA: ../datasets (57477, 9) (3, 4)


In [3]:

### Global Functions ###
import time
import random

# Reproducibility: seed Python's RNG, NumPy and PyTorch with the same value.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
# Also seed every visible CUDA device explicitly when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_state)

from sentence_transformers import SentenceTransformer
def load_model(candidates, idx=0, device="cpu"):
    """Load the SentenceTransformer stored at ``candidates[idx]``.

    Only the entry selected by ``idx`` is attempted; the other candidates are
    not tried as fallbacks.

    Args:
        candidates: Sequence of model paths / identifiers.
        idx: Position in ``candidates`` of the model to load.
        device: Device string forwarded to ``SentenceTransformer``.

    Returns:
        A ``(model, path)`` tuple with the loaded model and the path used.

    Raises:
        RuntimeError: If the model cannot be loaded. The underlying exception
            is chained as ``__cause__`` so the original traceback is kept.
        IndexError: If ``idx`` is out of range for ``candidates``.
    """
    path = candidates[idx]
    print("try:", path)
    try:
        model = SentenceTransformer(path, device=device)
    except Exception as err:
        # Re-raise with setup guidance, keeping the root cause attached.
        raise RuntimeError("Failed to load model. In Kaggle, you need to upload the model folder to Datasets and then link it via 'Add data'. Last error: " + str(err)) from err
    print("loaded model from:", path)
    return model, path


def build_feat(P, A, B):
    """Stack pairwise interaction features for prompt/response embeddings.

    Args:
        P: Prompt embeddings.
        A: Embeddings of response A (same shape as ``P``).
        B: Embeddings of response B (same shape as ``P``).

    Returns:
        ``np.hstack`` of, in order: P, A, B, A-B, |A-B|, A*B, P*A, P*B.
    """
    response_gap = A - B
    feature_blocks = (
        P,
        A,
        B,
        response_gap,
        np.abs(response_gap),
        A * B,
        P * A,
        P * B,
    )
    return np.hstack(feature_blocks)


def l2norm(a, eps=1e-12):
    """Scale each row of a 2-D array to unit Euclidean length.

    Row norms are floored at ``eps`` before dividing, so an all-zero row
    stays all-zero instead of producing NaN.
    """
    row_norms = np.linalg.norm(a, axis=1, keepdims=True)
    return a / np.clip(row_norms, eps, None)

def encode_texts(model, texts, batch_size=256):
    """Embed ``texts`` batch by batch and return row-normalised vectors.

    Args:
        model: Object exposing ``encode(batch, batch_size=..., ...)``.
        texts: Sliceable collection of strings; a ``pd.Series`` is converted
            to a plain list before encoding.
        batch_size: Number of texts sent to ``model.encode`` per call.

    Returns:
        The vertically stacked per-batch outputs, passed through ``l2norm``.
        The model's own normalisation is disabled (``normalize_embeddings=False``).

    A one-line progress indicator is rewritten in place after every batch.
    """
    items = texts.tolist() if isinstance(texts, pd.Series) else texts
    n_items = len(items)
    total_batches = -(-n_items // batch_size)  # ceiling division

    chunks = []
    for batch_num, offset in enumerate(range(0, n_items, batch_size), start=1):
        start_time = time.time()
        batch = items[offset:offset + batch_size]
        chunks.append(
            model.encode(
                batch,
                batch_size=len(batch),
                convert_to_numpy=True,
                normalize_embeddings=False,
                show_progress_bar=False,
            )
        )
        print(f"{batch_num}/{total_batches} | time: {time.time() - start_time:.2f}s", end='\r', flush=True)

    return l2norm(np.vstack(chunks))


def create_and_save_submission(predictions, filename, test_df, sample_df):
    """Build, normalise, save and verify a Kaggle submission file.

    Args:
        predictions (np.array): Class scores of shape (N, 3), e.g. the return
            value of ``predict_proba()``; columns are model_a / model_b / tie.
        filename (str): CSV path to write the submission to.
        test_df (pd.DataFrame): Frame providing the ``id`` column.
        sample_df (pd.DataFrame): Sample submission whose column order the
            output is aligned to.

    Returns:
        pd.DataFrame: The submission frame that was written.

    Notes:
        * Each row is rescaled to sum to 1. A row whose total is zero,
          negative or non-finite carries no usable information and is
          replaced by the uniform distribution (with a warning) instead of
          being written as an invalid row.
        * Verification problems are reported with ``print`` rather than
          raised, so a failed check never aborts the notebook.
    """
    print(f"Creating submission file: {filename}...")

    prob_cols = ["winner_model_a", "winner_model_b", "winner_tie"]

    # 1. Normalise probabilities (only the first three columns are used).
    probs = np.asarray(predictions, dtype=float)[:, :len(prob_cols)]
    row_sums = probs.sum(axis=1, keepdims=True)
    degenerate = ~np.isfinite(row_sums) | (row_sums <= 0)
    if degenerate.any():
        print(f"Warning: {int(degenerate.sum())} row(s) had no usable probability mass; using a uniform distribution for them.")
    safe_sums = np.where(degenerate, 1.0, np.clip(row_sums, 1e-15, None))
    probs = np.where(degenerate, 1.0 / len(prob_cols), probs / safe_sums)

    # 2. Assemble the submission frame.
    sub_df = pd.DataFrame({
        "id": test_df["id"],
        "winner_model_a": probs[:, 0],
        "winner_model_b": probs[:, 1],
        "winner_tie":     probs[:, 2],
    })

    # 3. Align columns with sample submission
    try:
        sub_df = sub_df[sample_df.columns]
    except KeyError as e:
        print(f"Warning: Columns in sample_df not found. Saving with default columns. Error: {e}")

    # 4. Save
    sub_df.to_csv(filename, index=False)

    # 5. Re-read the file and check its integrity. Explicit checks are used
    #    instead of `assert` so they still run under `python -O`.
    try:
        chk = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"Error: File not found after saving: {filename}")
        return sub_df

    problem = None
    if list(chk.columns) != list(sample_df.columns):
        problem = f"Column mismatch. Expected: {list(sample_df.columns)}, Got: {list(chk.columns)}"
    elif chk.isna().any().any():
        problem = "NaN values found in submission file."
    elif not np.allclose(chk[prob_cols].sum(1).values, 1.0):
        problem = "Probabilities do not sum to 1.0 for all rows."

    if problem is None:
        print(f"Successfully saved and verified: {filename} (Shape: {sub_df.shape})")
    else:
        print(f"Error: Submission file verification failed! {problem}")

    return sub_df

def build_strong_lexical_features(df):
    """Build per-row lexical / length-bias features for a prompt and two responses.

    For each row of ``df`` (columns ``prompt``, ``response_a``, ``response_b``)
    the text statistics from ``stats_strong`` are combined into:

    * ``p_*``: three length statistics of the prompt,
    * ``a_*`` / ``b_*``: all eight statistics of each response,
    * ``d_*``: A-minus-B difference for each of the eight statistics,
    * ``r_*``: (A + 1) / (B + 1) ratio for the three length statistics.

    Returns:
        pd.DataFrame with one row per input row; column order is
        p_*, a_*, b_*, d_*, r_* as listed above.
    """
    prompt_keys = ("len_char", "len_tok", "num_sent")
    response_keys = (
        "len_char", "len_tok", "num_sent", "num_code",
        "num_list", "num_upper", "num_punct", "avg_tok_len",
    )
    ratio_keys = ("len_char", "len_tok", "num_sent")

    records = []
    for prompt, resp_a, resp_b in zip(df["prompt"], df["response_a"], df["response_b"]):
        p_stats = stats_strong(prompt)
        a_stats = stats_strong(resp_a)
        b_stats = stats_strong(resp_b)

        record = {f"p_{key}": p_stats[key] for key in prompt_keys}
        record.update((f"a_{key}", a_stats[key]) for key in response_keys)
        record.update((f"b_{key}", b_stats[key]) for key in response_keys)
        # A-B differences
        record.update((f"d_{key}", a_stats[key] - b_stats[key]) for key in response_keys)
        # Ratios; the +1 on both sides avoids division by zero
        record.update(
            (f"r_{key}", (a_stats[key] + 1) / (b_stats[key] + 1)) for key in ratio_keys
        )
        records.append(record)

    return pd.DataFrame(records)

# Define the "Strong" lexical feature builder
def stats_strong(s):
    """Return simple surface statistics of a text as a dict.

    Non-string input (e.g. ``None`` or NaN) is treated as the empty string.

    Keys:
        len_char: number of characters.
        len_tok: number of whitespace-separated tokens.
        num_sent: total count of '.', '!' and '?' characters.
        num_code: count of backtick characters.
        num_list: count of "- " plus "* " substrings.
        num_upper: count of uppercase characters.
        num_punct: count of characters in ",;:()".
        avg_tok_len: mean token length, 0.0 when there are no tokens.
    """
    text = s if isinstance(s, str) else ""
    tokens = text.split()
    n_tokens = len(tokens)
    mean_token_len = sum(map(len, tokens)) / n_tokens if n_tokens else 0.0

    return {
        "len_char": len(text),
        "len_tok": n_tokens,
        "num_sent": text.count(".") + text.count("!") + text.count("?"),
        "num_code": text.count("`"),
        "num_list": sum(text.count(marker) for marker in ("- ", "* ")),
        "num_upper": sum(1 for ch in text if ch.isupper()),
        "num_punct": sum(1 for ch in text if ch in ",;:()"),
        "avg_tok_len": mean_token_len,
    }

def preprocess_function(examples):
    """Tokenize a batch of (prompt, response_a, response_b) examples with labels.

    Relies on the module-level ``tokenizer`` and ``max_length``.

    The two responses are merged into one string,
    ``"A: <response_a> <sep_token> B: <response_b>"``, and passed as the second
    sequence of a sentence pair whose first sequence is the prompt. For a
    BERT-style tokenizer this yields roughly
    ``[CLS] prompt [SEP] A: ... [SEP] B: ... [SEP]``.

    Args:
        examples: Batched mapping with ``prompt``, ``response_a``,
            ``response_b`` and ``labels`` entries.

    Returns:
        The tokenizer output with ``labels`` copied over unchanged.
    """
    separator = tokenizer.sep_token

    # Merge both responses into a single second segment per example.
    paired_responses = []
    for resp_a, resp_b in zip(examples['response_a'], examples['response_b']):
        paired_responses.append(f"A: {resp_a} {separator} B: {resp_b}")

    encoded = tokenizer(
        examples['prompt'],
        paired_responses,
        max_length=max_length,
        truncation=True,  # which truncation strategy works best is still open
        padding=False,    # left unpadded; a data collator is expected to pad per batch
    )

    encoded["labels"] = examples["labels"]
    return encoded


print("All functions loaded.")


All functions loaded.


### Step 1. Load All Calibrated Models and Predict on Test Data

In [4]:
### C1: DeBERTa + LoRA (Calibrated) ###
# Loads the base sequence classifier plus its LoRA adapter, scores the test
# set, then maps each class probability through a per-class calibrator.
# Result: test_probs_calibrated_c1 with shape (len(test), 3).
MODEL_NAME_C1 = "deberta-v3-base"
LORA_ADAPTER_DIR = f"{BASE_DIR}/lora_adapter_{MODEL_NAME_C1}"
C1_CALIBRATOR_PATH = f"{BASE_DIR}/candidate_1_calibrators.pkl"
max_length = 512

print("\n--- Candidate 1: DeBERTa + LoRA ---")
# 1. Load Base Model and Peft Adapter
try:
    base_model_path = f"{BASE_DIR}/{MODEL_NAME_C1}"
    base_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_path,
        num_labels=3,
        local_files_only=True
    )
    peft_model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_DIR)
    peft_model.to(device)
    peft_model.eval()
    # The tokenizer is read from the adapter directory, not the base model directory.
    tokenizer = AutoTokenizer.from_pretrained(LORA_ADAPTER_DIR, local_files_only=True)
    print("C1 model loaded successfully.")
except Exception as e:
    # Best-effort: a missing or broken C1 must not stop the other candidates.
    print(f"ERROR loading C1 model: {e}")
    peft_model = None

# Explicit None check: the truthiness of a model object is not a reliable
# "was it loaded" signal.
if peft_model is not None:
    # 2. Tokenize Test Data (using C1's preprocessing logic)
    def preprocess_test_function(examples):
        """Tokenize prompt vs. "A: ... <sep> B: ..." pairs, padded to max_length."""
        response_pair = [f"A: {a} {tokenizer.sep_token} B: {b}" for a, b in zip(examples['response_a'], examples['response_b'])]
        # Fixed-length padding gives every example the same length, so the
        # default DataLoader collation below can stack them into a batch.
        return tokenizer(
            examples['prompt'], response_pair, max_length=max_length, truncation=True, padding='max_length'
        )
    test_dataset = Dataset.from_pandas(test)
    tokenized_test_dataset = test_dataset.map(preprocess_test_function, batched=True, remove_columns=test.columns.tolist())
    tokenized_test_dataset.set_format("torch")

    # 3. Predict Uncalibrated Probabilities
    from torch.utils.data import DataLoader
    dl = DataLoader(tokenized_test_dataset, batch_size=8)
    all_logits = []
    with torch.no_grad():
        for batch in dl:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = peft_model(**batch)
            all_logits.append(outputs.logits.cpu().numpy())
    test_logits = np.vstack(all_logits)
    test_probs_uncalibrated = softmax(test_logits, axis=1)

    # 4. Load Calibrator and Calibrate
    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # calibrator files produced by the team's own training run.
    try:
        calibrators = joblib.load(C1_CALIBRATOR_PATH)
        test_probs_calibrated_c1 = np.zeros_like(test_probs_uncalibrated)
        # One calibrator per class, each applied to that class's probability column.
        for class_idx in range(3):
            test_probs_calibrated_c1[:, class_idx] = calibrators[class_idx].predict(test_probs_uncalibrated[:, class_idx])

        # Independent per-class calibration breaks the sum-to-one property,
        # so renormalise each row.
        row_sums = test_probs_calibrated_c1.sum(axis=1, keepdims=True)
        test_probs_calibrated_c1 = test_probs_calibrated_c1 / np.clip(row_sums, 1e-15, None)
        print("C1 test predictions successfully calibrated.")
    except Exception as e:
        print(f"ERROR loading or applying C1 calibrator: {e}. Using uncalibrated.")
        test_probs_calibrated_c1 = test_probs_uncalibrated
else:
    # NOTE(review): the all-zero placeholder is later fed to the ensemble as
    # if it were a prediction; confirm that is acceptable for the chosen
    # ensemble method.
    test_probs_calibrated_c1 = np.zeros((len(test), 3)) # Placeholder for missing model
    print("C1 skipped due to loading error.")


### C2: PLM + XGBoost (Calibrated) ###
# Candidate 2: embed prompt / response_a / response_b with a sentence-embedding
# model, combine the three embeddings with build_feat, and score the result
# with the pickled model stored at C2_CALIBRATED_MODEL_PATH.
# Outputs: X_test_c2 (feature matrix, or None on failure) and
# test_pred_c2_calibrated (class probabilities, or an all-zero placeholder).
MODEL_NAME_C2 = "e5-base-v2"
C2_MODEL_PATH = f"{BASE_DIR}/{MODEL_NAME_C2}"
GBM_CHOICE = "XGBOOST" # Assuming XGBOOST was the choice made in the training notebook
C2_CALIBRATED_MODEL_PATH = f"{BASE_DIR}/candidate_2_{GBM_CHOICE}_{MODEL_NAME_C2}_CALIBRATED.pkl"

print("\n--- Candidate 2: PLM + XGBoost ---")
test_pred_c2_calibrated = None

try:
    # 1. Load Embedding Model and Generate Test Features (X_test_c2)
    sbert_model, _ = load_model([C2_MODEL_PATH], idx=0, device=device)
    prompt_emb_te = encode_texts(sbert_model, test["prompt"])
    a_emb_te = encode_texts(sbert_model, test["response_a"])
    b_emb_te = encode_texts(sbert_model, test["response_b"])
    X_test_c2 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)
    # Drop the references to the encoder and raw embeddings; only the
    # combined feature matrix is needed from here on.
    del sbert_model, prompt_emb_te, a_emb_te, b_emb_te
    print("C2 test features (X_test_c2) generated.")
    
    # 2. Load Final Calibrated Model and Predict
    # NOTE(review): presumably a calibrated XGBoost classifier saved by the
    # training notebook; confirm. joblib.load unpickles arbitrary objects, so
    # only load files the team produced itself.
    calibrated_final_c2 = joblib.load(C2_CALIBRATED_MODEL_PATH)
    test_pred_c2_calibrated = calibrated_final_c2.predict_proba(X_test_c2)
    print("C2 test predictions successfully generated and calibrated.")
    
except Exception as e:
    # Best-effort: any failure is reported and replaced by placeholders so
    # the remaining candidates can still run.
    print(f"ERROR in C2 processing: {e}")
    X_test_c2 = None
    test_pred_c2_calibrated = np.zeros((len(test), 3)) # Placeholder


### C3: All Features + MLP (Calibrated) ###
# Candidate 3: embedding features plus lexical features, scaled with a saved
# scaler and scored with the pickled model at C3_CALIBRATED_MODEL_PATH.
# Output: test_pred_c3_calibrated (class probabilities, or an all-zero
# placeholder on failure).
MODEL_NAME_C3 = "e5-base-v2"
C3_MODEL_PATH = f"{BASE_DIR}/{MODEL_NAME_C3}"
C3_CALIBRATED_MODEL_PATH = f"{BASE_DIR}/candidate_3_MLP_CALIBRATED.pkl"
C3_FINAL_SCALER_PATH = f"{BASE_DIR}/candidate_3_scaler_final.pkl"

print("\n--- Candidate 3: All Features + MLP ---")
test_pred_c3_calibrated = None

try:
    # 1. Embedding features.
    # C2 already encoded the same texts with the same pipeline
    # (load_model -> encode_texts -> build_feat). When both candidates point
    # at the same embedding model and C2's matrix exists, reuse it instead of
    # loading the encoder and embedding the test set a second time.
    if C3_MODEL_PATH == C2_MODEL_PATH and X_test_c2 is not None:
        X_emb_test_c3 = X_test_c2
        print("C3 reusing embedding features computed for C2.")
    else:
        sbert_model, _ = load_model([C3_MODEL_PATH], idx=0, device=device)
        prompt_emb_te = encode_texts(sbert_model, test["prompt"])
        a_emb_te = encode_texts(sbert_model, test["response_a"])
        b_emb_te = encode_texts(sbert_model, test["response_b"])
        X_emb_test_c3 = build_feat(prompt_emb_te, a_emb_te, b_emb_te)
        del sbert_model, prompt_emb_te, a_emb_te, b_emb_te

    # 2. Generate Lexical Features
    X_test_lex_strong = build_strong_lexical_features(test)

    # 3. Combine and Clean Features (X_test_c3): NaN and +/-inf become 0.
    #    nan_to_num returns a copy, so a reused X_test_c2 is left untouched.
    X_test_lex_strong = X_test_lex_strong.fillna(0).replace([np.inf, -np.inf], 0)
    X_emb_test_c3_safe = np.nan_to_num(X_emb_test_c3, nan=0.0, posinf=0.0, neginf=0.0)
    # Column order (lexical first, then embeddings) must match what the
    # saved scaler/model were fitted on.
    X_test_c3 = np.hstack([X_test_lex_strong, X_emb_test_c3_safe])
    print("C3 all test features (X_test_c3) generated.")

    # 4. Load Scaler, Scale Features
    # NOTE(security): joblib.load unpickles arbitrary objects; only load
    # artifacts produced by the team's own training run.
    scaler_final_c3 = joblib.load(C3_FINAL_SCALER_PATH)
    X_test_scaled_c3 = scaler_final_c3.transform(X_test_c3)
    print("C3 features scaled.")

    # 5. Load Final Calibrated Model and Predict
    calibrated_final_c3 = joblib.load(C3_CALIBRATED_MODEL_PATH)
    test_pred_c3_calibrated = calibrated_final_c3.predict_proba(X_test_scaled_c3)
    print("C3 test predictions successfully generated and calibrated.")

except Exception as e:
    # Best-effort: report the failure and continue with a placeholder.
    print(f"ERROR in C3 processing: {e}")
    test_pred_c3_calibrated = np.zeros((len(test), 3)) # Placeholder


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ../models/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Candidate 1: DeBERTa + LoRA ---
C1 model loaded successfully.


Map: 100%|██████████| 3/3 [00:00<00:00, 192.42 examples/s]


C1 test predictions successfully calibrated.

--- Candidate 2: PLM + XGBoost ---
try: ../models/e5-base-v2
loaded model from: ../models/e5-base-v2
C2 test features (X_test_c2) generated.
C2 test predictions successfully generated and calibrated.

--- Candidate 3: All Features + MLP ---
try: ../models/e5-base-v2


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


loaded model from: ../models/e5-base-v2
C3 all test features (X_test_c3) generated.
C3 features scaled.
C3 test predictions successfully generated and calibrated.


### Step 2. Final Ensemble: Stacked Generalization

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

print("\n--- Ensemble Model (Stacked Generalization) Training ---")
ENSEMBLE_MODEL_PATH = f"{BASE_DIR}/meta_learner_stacked.pkl"

# The stacked meta-learner is not fitted in this notebook; it is expected to
# have been fitted and saved by the local training notebook and is only
# loaded here. Fitting it here on full-train predictions would leak training
# labels into the second level.
# NOTE(security): joblib.load unpickles arbitrary objects; only load artifacts
# produced by the team's own training run.

# 1. Load the saved meta-learner. This is the only artifact the stacked
#    ensemble needs at inference time, so it is loaded on its own and a
#    missing auxiliary file cannot disable it.
try:
    meta_learner = joblib.load(ENSEMBLE_MODEL_PATH)
except FileNotFoundError as e:
    print(f"ERROR: {e}")
    print("WARNING: Meta-learner not found. Skipping stacked ensemble.")
    meta_learner = None
else:
    print("Stacked meta-learner loaded successfully.")

# 2. Optionally load the candidates' full-train predictions and assemble the
#    matching meta-feature matrix. Nothing below depends on them; they are
#    kept for inspection only, so their absence is not an error.
try:
    full_train_pred_c1 = joblib.load(f'{BASE_DIR}/full_train_pred_c1.pkl')
    full_train_pred_c2 = joblib.load(f'{BASE_DIR}/full_train_pred_c2.pkl')
    full_train_pred_c3 = joblib.load(f'{BASE_DIR}/full_train_pred_c3.pkl')
    meta_features = np.hstack([full_train_pred_c1, full_train_pred_c2, full_train_pred_c3])
    y_full = y # Full labels
except FileNotFoundError as e:
    print(f"NOTE: full-train candidate predictions not found ({e}); they are not needed for inference.")

# 3. Decision variables consumed by the next cell.
if meta_learner is not None:
    final_ensemble_weights, final_ensemble_method = None, 'stacked'
else:
    # Fall back to an equal-weight average when the meta-learner is unavailable.
    print("Falling back to Simple Average.")
    final_ensemble_weights, final_ensemble_method = np.array([1/3, 1/3, 1/3]), 'simple'


--- Ensemble Model (Stacked Generalization) Training ---
Stacked meta-learner loaded successfully.


In [None]:
print("\n" + "="*80)
print("GENERATING FINAL ENSEMBLE PREDICTIONS")
print("="*80)

# Stays None unless one of the ensemble branches below produces predictions,
# so an unrecognised method ends in the error message instead of a NameError.
final_test_pred = None

if final_ensemble_method is not None:
    # Use the calibrated test predictions from Step 1
    test_pred_c1 = test_probs_calibrated_c1
    test_pred_c2 = test_pred_c2_calibrated
    test_pred_c3 = test_pred_c3_calibrated

    # Safety check: every candidate must provide one row per test example.
    if any(pred.shape[0] != len(test) for pred in (test_pred_c1, test_pred_c2, test_pred_c3)):
        print("ERROR: Test prediction shapes mismatch. Cannot ensemble.")
        final_ensemble_method = None

if final_ensemble_method is not None:
    print(f"\nUsing ensemble method: {final_ensemble_method.upper()}")

    # Weighted variants need explicit weights; without them, use the plain
    # average rather than failing on a None weight vector.
    if final_ensemble_method in ['optimal', 'weighted'] and final_ensemble_weights is None:
        print("WARNING: No ensemble weights available. Falling back to Simple Average.")
        final_ensemble_method = 'simple'

    if final_ensemble_method == 'simple':
        final_test_pred = (test_pred_c1 + test_pred_c2 + test_pred_c3) / 3.0
    elif final_ensemble_method in ['optimal', 'weighted']:
        final_test_pred = (
            final_ensemble_weights[0] * test_pred_c1 +
            final_ensemble_weights[1] * test_pred_c2 +
            final_ensemble_weights[2] * test_pred_c3
        )
    elif final_ensemble_method == 'stacked':
        # Meta-features are the three candidates' probabilities side by side,
        # scored by the meta-learner loaded in the previous cell.
        test_meta_features = np.hstack([test_pred_c1, test_pred_c2, test_pred_c3])
        final_test_pred = meta_learner.predict_proba(test_meta_features)
    elif final_ensemble_method == 'rank':
        def probs_to_ranks_test(probs):
            """Per row, rank classes by descending probability (0 = most likely)."""
            ranks = np.zeros_like(probs, dtype=int)
            for i in range(probs.shape[0]):
                ranks[i] = np.argsort(np.argsort(-probs[i]))
            return ranks
        ranks_test_c1 = probs_to_ranks_test(test_pred_c1)
        ranks_test_c2 = probs_to_ranks_test(test_pred_c2)
        ranks_test_c3 = probs_to_ranks_test(test_pred_c3)
        avg_ranks_test = (ranks_test_c1 + ranks_test_c2 + ranks_test_c3) / 3.0
        num_classes_test = test_pred_c1.shape[1]
        # Turn average ranks into probabilities: exp(num_classes - rank),
        # normalised per row (lower average rank -> higher probability).
        rank_scores = np.exp(float(num_classes_test) - avg_ranks_test)
        final_test_pred = rank_scores / rank_scores.sum(axis=1, keepdims=True)
    else:
        print(f"ERROR: Unknown ensemble method: {final_ensemble_method!r}")

if final_test_pred is not None:
    # Safety normalization
    row_sums = final_test_pred.sum(axis=1, keepdims=True)
    final_test_pred = final_test_pred / np.clip(row_sums, 1e-15, None)

    print(f"\nFinal predictions shape: {final_test_pred.shape}")
    print(f"Probability sum check (first 3): {final_test_pred[:3].sum(axis=1)}")

    final_filename = "submission.csv"
    create_and_save_submission(
        predictions=final_test_pred,
        filename=final_filename,
        test_df=test,
        sample_df=sample
    )

    print("\n" + "="*80)
    print("FINAL ENSEMBLE SUBMISSION CREATED")
    print(f"File: {final_filename}")
    print(f"Method: {final_ensemble_method.upper()}")
    print("="*80)
else:
    print("\nERROR: Ensemble method not determined or prediction failed.")



GENERATING FINAL ENSEMBLE PREDICTIONS

Using ensemble method: STACKED

Final predictions shape: (3, 3)
Probability sum check (first 3): [1. 1. 1.]
Creating submission file: submission.csv...
Successfully saved and verified: submission.csv (Shape: (3, 4))

FINAL ENSEMBLE SUBMISSION CREATED
File: submission.csv
Method: STACKED
Expected Validation LogLoss: ~1.071608
