# Solution

This is an inference notebook using 4-bit quantized [Gemma-2 9b Instruct](https://blog.google/technology/developers/google-gemma-2/) and a LoRA adapter trained using the script uploaded [here](https://www.kaggle.com/code/emiz6413/gemma-2-9b-4-bit-qlora-finetune).
Although we could merge the LoRA adapter into the base model for faster inference, naively doing so could introduce non-negligible quantization error. Therefore, I opted to keep the LoRA adapter unmerged. 

The submission takes around 4 hours with `max_length=2048` without TTA. With TTA, it may take around 5-6 hours or more.

In [1]:
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/transformers-4.42.3-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/accelerate-0.32.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
Installing collected packages: bitsandbytes, accelerate, transformers, peft
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.30.1
    Uninstalling accelerate-0.30.1:
      Successfully uninstalled accelerate-0.30.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed accelerate-0.32.1 bitsandbytes-0.43.1 peft-0.11.1 transformers-4.42.3


In [2]:
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

2025-11-10 16:28:17.867367: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-10 16:28:17.867497: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-10 16:28:18.012287: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# The inference cells below place one model copy on cuda:0 and another on cuda:1,
# so fail fast unless exactly two GPUs are visible.
assert torch.cuda.device_count() == 2

# Configurations

In [4]:
@dataclass
class Config:
    """Paths and runtime switches for this inference notebook.

    None of the attributes below carries a type annotation, so ``@dataclass``
    does not turn them into fields: they are plain class attributes that are
    read through the ``cfg`` instance created after this class.
    """
    #gemma_dir = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    
    #gemma_dir = '/kaggle/input/gemma-2-v6/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v6-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v7-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v7-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v8-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v8-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v9-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v9-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v10-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v10-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v11-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v11-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v12-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v12-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v13-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v13-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v14-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v14-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v15-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v15-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2-v16-8bit/transformers/default/1'
    #gemma_dir = '/kaggle/input/gemma-2-v16-4bit/transformers/default/1'

    #gemma_dir = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-8bit/1'

    # Base model directory: the tokenizer and both per-GPU base models are loaded from here.
    gemma_dir = '/kaggle/input/gemma-2-v21/transformers/default/1'
    # Second base-model path; not referenced by the active cells visible in this notebook.
    gemma2_dir = '/kaggle/input/gemma-2-v6/transformers/default/1'
    
    # LoRA adapter that is attached to the base model on each GPU.
    lora_dir = '/kaggle/input/73zap2gx/checkpoint-5748'
    #lora_dir = '/kaggle/input/lora-fold0-2000'
    # Second adapter path; not referenced by the active cells visible in this notebook.
    lora2_dir = '/kaggle/input/lora-fold1-4000'
    
    #lora_dir = '/kaggle/input/lora-fold4-v2'
    #lora_dir = '/kaggle/input/lora-fold0-2000'
    #lora_dir0 = '/kaggle/input/lora-fold0'
    #lora_dir1 = '/kaggle/input/lora-fold1'
    #lora_dir2 = '/kaggle/input/lora-fold2'
    
    #lora_dir0 = '/kaggle/input/lora-fold0-5000'
    #lora_dir1 = '/kaggle/input/lora-fold1-5000'
    #lora_dir2 = '/kaggle/input/lora-fold2-5000'
    
    # Default token budget handed to tokenize().
    max_length = 2048
    # Default number of rows per forward pass in inference().
    batch_size = 4
    # NOTE(review): the visible inference cells use explicit cuda:0 / cuda:1 devices
    # rather than this value - confirm whether it is still needed.
    device = torch.device("cuda")    
    tta = True  # test time augmentation. <prompt>-<model-b's response>-<model-a's response>
    spread_max_length = False  # whether to apply max_length//3 on each input or max_length on the concatenated input

cfg = Config()

# Load & pre-process Data 

In [5]:
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

In [6]:
def process_text(text: str) -> str:
    """Flatten a JSON-encoded list of conversation turns into one string.

    The cell value is parsed with ``json.loads`` rather than ``eval``: ``eval``
    executes arbitrary expressions found in the CSV and leaves escaped
    surrogate pairs (e.g. emoji written as ``\\ud83d\\ude00``) as two lone
    surrogate code points instead of one character.

    Args:
        text: JSON array of strings; ``null`` entries are allowed.

    Returns:
        The turns joined by a single space, with ``null`` turns replaced by
        an empty string (the same mapping the previous ``eval`` call used).

    Raises:
        ValueError: if ``text`` is not valid JSON (``json.JSONDecodeError``).
    """
    import json  # local import so the cell does not depend on the import cell

    turns = json.loads(text)
    return " ".join("" if turn is None else turn for turn in turns)

# Replace each JSON-list column with its flattened single-string form.
# NOTE: this overwrites the columns in place, so the cell is meant to run once
# per freshly loaded `test` frame.
for column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, column] = test[column].apply(process_text)

display(test.head(5))

Unnamed: 0,id,prompt,response_a,response_b
0,136060,"I have three oranges today, I ate an orange ye...",You have two oranges today.,You still have three oranges. Eating an orange...
1,211333,You are a mediator in a heated political debat...,Thank you for sharing the details of the situa...,Mr Reddy and Ms Blue both have valid points in...
2,1233961,How to initialize the classification head when...,When you want to initialize the classification...,To initialize the classification head when per...


# Tokenize

In [7]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Build model inputs from parallel sequences of prompts and responses.

    Each sample is rendered as ``<prompt>: ...``, ``<response_a>: ...`` and
    ``<response_b>: ...`` segments (the response segments start with a blank line).

    Args:
        tokenizer: callable tokenizer returning an object with ``input_ids``
            (and ``attention_mask``) attributes.
        prompt, response_a, response_b: equally ordered iterables of strings.
        max_length: token budget; applied to the whole concatenated text, or
            split as ``max_length // 3`` per segment when ``spread_max_length``.
        spread_max_length: tokenize the three segments separately and
            concatenate their ids instead of tokenizing the joined text.

    Returns:
        ``(input_ids, attention_mask)``, one entry per sample.
    """
    prompt_texts = ["<prompt>: " + text for text in prompt]
    a_texts = ["\n\n<response_a>: " + text for text in response_a]
    b_texts = ["\n\n<response_b>: " + text for text in response_b]

    if not spread_max_length:
        # Single pass over the concatenated text with the full budget.
        joined = [p + a + b for p, a, b in zip(prompt_texts, a_texts, b_texts)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    # Each segment gets a third of the budget and is truncated on its own.
    def encode(texts):
        return tokenizer(texts, max_length=max_length//3, truncation=True, padding=False).input_ids

    prompt_ids = encode(prompt_texts)
    a_ids = encode(a_texts)
    b_ids = encode(b_texts)
    input_ids = [p + a + b for p, a, b in zip(prompt_ids, a_ids, b_ids)]
    # No padding has been applied yet, so every position is a real token.
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [8]:
%%time

tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"


def _tokenized_frame(first_response, second_response):
    """Return a frame of id / input_ids / attention_mask / length for one response order."""
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], first_response, second_response
    )
    frame["length"] = frame["input_ids"].apply(len)
    return frame


# original order: <prompt> - <response_a> - <response_b>
data = _tokenized_frame(test["response_a"], test["response_b"])
# swap response_a & response_b
aug_data = _tokenized_frame(test["response_b"], test["response_a"])

CPU times: user 683 ms, sys: 158 ms, total: 841 ms
Wall time: 1.05 s


In [9]:
print(tokenizer.decode(data["input_ids"][0]))

<bos><prompt>: I have three oranges today, I ate an orange yesterday. How many oranges do I have?

<response_a>: You have two oranges today.

<response_b>: You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.<eos>


In [10]:
print(tokenizer.decode(aug_data["input_ids"][0]))

<bos><prompt>: I have three oranges today, I ate an orange yesterday. How many oranges do I have?

<response_a>: You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.

<response_b>: You have two oranges today.<eos>


# Load model + Inferencing (Multi-Fold)

In [11]:
'''
NUM_FOLDS = 3
fold_preds = []

for fold in range(NUM_FOLDS):

    lora_path = ""
    if fold == 0:
        lora_path = cfg.lora_dir0
    elif fold == 1:
        lora_path = cfg.lora_dir1
    elif fold == 2:
        lora_path = cfg.lora_dir2
    else:
        raise ValueError(f"Unexpected fold number: {fold}")
    
    # Load base model on GPU 0
    device_0 = torch.device('cuda:0')
    model_0 = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        device_map=device_0,
        use_cache=False,
    )
    
    # Load base model on GPU 1
    device_1 = torch.device('cuda:1')
    model_1 = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        device_map=device_1,
        use_cache=False,
    )
    
    # Load LoRA adapter
    model_0 = PeftModel.from_pretrained(model_0, lora_path)
    model_1 = PeftModel.from_pretrained(model_1, lora_path)
    
    # Inference
    @torch.no_grad()
    @torch.cuda.amp.autocast()
    def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
        a_win, b_win, tie = [], [], []
        
        for start_idx in range(0, len(df), batch_size):
            end_idx = min(start_idx + batch_size, len(df))
            tmp = df.iloc[start_idx:end_idx]
            input_ids = tmp["input_ids"].to_list()
            attention_mask = tmp["attention_mask"].to_list()
            inputs = pad_without_fast_tokenizer_warning(
                tokenizer,
                {"input_ids": input_ids, "attention_mask": attention_mask},
                padding="longest",
                pad_to_multiple_of=None,
                return_tensors="pt",
            )
            outputs = model(**inputs.to(device))
            proba = outputs.logits.softmax(-1).cpu()
            
            a_win.extend(proba[:, 0].tolist())
            b_win.extend(proba[:, 1].tolist())
            tie.extend(proba[:, 2].tolist())
        
        df["winner_model_a"] = a_win
        df["winner_model_b"] = b_win
        df["winner_tie"] = tie
        
        return df
    
    st = time.time()
    
    # sort by input length to fully leverage dynaminc padding
    data = data.sort_values("length", ascending=False)
    # the total #tokens in sub_1 and sub_2 should be more or less the same
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()
    
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))
    
    result_df = pd.concat(list(results), axis=0)
    proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values
    
    print(f"elapsed time: {time.time() - st}")
    
    st = time.time()
    
    if cfg.tta:
        data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
        sub_1 = data.iloc[0::2].copy()
        sub_2 = data.iloc[1::2].copy()
    
        with ThreadPoolExecutor(max_workers=2) as executor:
            results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))
    
        tta_result_df = pd.concat(list(results), axis=0)
        # recall TTA's order is flipped
        tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values 
        # average original result and TTA result.
        proba = (proba + tta_proba) / 2
    
    print(f"elapsed time: {time.time() - st}")
    fold_preds.append(proba)
'''
# === Original

"""
result_df.loc[:, "winner_model_a"] = proba[:, 0]
result_df.loc[:, "winner_model_b"] = proba[:, 1]
result_df.loc[:, "winner_tie"] = proba[:, 2]
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)
"""

# === Average Probabilities of all Folds =====
'''
final_proba = np.mean(fold_preds, axis=0)
'''
# === Weighted Averaging (Validation-Weighted) ===
'''
import numpy as np

# fold_preds = [fold0_pred, fold1_pred, fold2_pred]  # numpy arrays of shape (n_samples, 3)
fold_logloss = np.array([1.0615, 1.0743, 1.3646])

# Weight folds by inverse log loss
weights = 1 / fold_logloss
weights /= weights.sum()  # normalize to sum=1
print("Weights:", weights)

# Weighted blend
final_proba = np.average(fold_preds, axis=0, weights=weights)
'''
# === Stacking (Meta-Ensemble) ===
'''
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# suppose each fold_pred is shape (N, num_classes)
X_stack = np.concatenate(fold_preds, axis=1)
# true labels (from validation set)
y_stack = np.array(y_true)

# train a meta model
meta_model = LogisticRegression(max_iter=1000)
meta_model.fit(X_stack, y_stack)

# during inference
X_test_stack = np.concatenate([fold0_test, fold1_test, fold2_test], axis=1)
final_pred = meta_model.predict_proba(X_test_stack)
final_proba = final_pred
'''

# To try:
# === Geometric Mean (Multiplicative Ensemble) ===
'''
import numpy as np

# avoid zeros
eps = 1e-9
final_pred = (fold_preds[0] + eps) * (fold_preds[1] + eps) * (fold_preds[2] + eps)
final_pred = final_pred ** (1/3)  # geometric mean
final_pred /= final_pred.sum(axis=1, keepdims=True)  # normalize
'''

# === Majority Voting / Argmax Voting ===
'''
import numpy as np
from scipy.stats import mode

fold0_label = np.argmax(fold_preds[0], axis=1)
fold1_label = np.argmax(fold_preds[1], axis=1)
fold2_label = np.argmax(fold_preds[2], axis=1)

votes = np.vstack([fold0_label, fold1_label, fold2_label]).T
final_label, _ = mode(votes, axis=1)
final_label = final_label.flatten()

from sklearn.preprocessing import OneHotEncoder

# Convert labels → one-hot probabilities
num_classes = 3
final_proba = np.eye(num_classes)[final_label]
'''

# === Bayesian Model Averaging
'''
# Example validation accuracies (you can replace with your own)
val_acc = np.array([0.86, 0.88, 0.84])

# Convert to softmax weights so they sum to 1
weights = np.exp(val_acc) / np.exp(val_acc).sum()
print("Weights:", weights)

# Weighted average of fold prediction probabilities
final_proba = (
    weights[0] * fold_preds[0] +
    weights[1] * fold_preds[1] +
    weights[2] * fold_preds[2]
)

# Normalize just in case (should already be close to valid probabilities)
final_proba /= final_proba.sum(axis=1, keepdims=True)
'''


# === Hybrid – Weighted + Geometric Mean ===
'''
fold_preds = np.clip(fold_preds, 1e-9, 1.0)
log_preds = [np.log(p) * w for p, w in zip(fold_preds, weights)]
final_proba = np.exp(np.sum(log_preds, axis=0) / np.sum(weights))
final_proba /= final_proba.sum(axis=1, keepdims=True)
'''
# ===
'''
result_df.loc[:, "winner_model_a"] = final_proba[:, 0]
result_df.loc[:, "winner_model_b"] = final_proba[:, 1]
result_df.loc[:, "winner_tie"] = final_proba[:, 2]
submission_df = result_df[["id", "winner_model_a", "winner_model_b", "winner_tie"]]
submission_df.to_csv("submission.csv", index=False)
display(submission_df)
'''

'\nresult_df.loc[:, "winner_model_a"] = final_proba[:, 0]\nresult_df.loc[:, "winner_model_b"] = final_proba[:, 1]\nresult_df.loc[:, "winner_tie"] = final_proba[:, 2]\nsubmission_df = result_df[["id", "winner_model_a", "winner_model_b", "winner_tie"]]\nsubmission_df.to_csv("submission.csv", index=False)\ndisplay(submission_df)\n'

# Load model + Inferencing (Single Fold)

In [12]:
# One full copy of the base model per GPU, so the two halves of the data can be
# scored in parallel.
device_0 = torch.device('cuda:0')
device_1 = torch.device('cuda:1')

# Load base model on GPU 0, then on GPU 1
model_0, model_1 = [
    Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        device_map=gpu,
        use_cache=False,
    )
    for gpu in (device_0, device_1)
]

# Attach the (unmerged) LoRA adapter to each copy
model_0, model_1 = [
    PeftModel.from_pretrained(base, cfg.lora_dir) for base in (model_0, model_1)
]


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-v21/transformers/default/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-v21/transformers/default/1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:

@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Rows are processed in consecutive batches of ``batch_size``; each batch is
    padded to its own longest sequence with the notebook-level ``tokenizer``.

    Args:
        df: frame with ``input_ids`` and ``attention_mask`` columns (lists of ints).
        model: sequence-classification model whose logits have at least 3 classes.
        device: device the padded batch is moved to before the forward pass.
        batch_size: number of rows per forward pass.
        max_length: not used by the body; kept so existing calls stay valid.

    Returns:
        The same ``df`` object, modified in place with ``winner_model_a``,
        ``winner_model_b`` and ``winner_tie`` columns.
    """
    probability_rows = []

    for offset in range(0, len(df), batch_size):
        # Slicing past the end is clamped, so the last batch may be shorter.
        chunk = df.iloc[offset:offset + batch_size]
        features = {
            "input_ids": chunk["input_ids"].to_list(),
            "attention_mask": chunk["attention_mask"].to_list(),
        }
        padded = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**padded.to(device)).logits
        probability_rows.extend(logits.softmax(-1).cpu().tolist())

    # Class order of the classification head: model_a wins, model_b wins, tie.
    df["winner_model_a"] = [row[0] for row in probability_rows]
    df["winner_model_b"] = [row[1] for row in probability_rows]
    df["winner_tie"] = [row[2] for row in probability_rows]

    return df


In [14]:

# Time the pass over the original (un-swapped) inputs.
st = time.time()

# sort by input length to fully leverage dynamic padding
# NOTE: this rebinds `data`; from here on its rows are length-sorted, not in test.csv order.
data = data.sort_values("length", ascending=False)
# the total #tokens in sub_1 and sub_2 should be more or less the same
# (alternating rows of the length-sorted frame go to each GPU)
sub_1 = data.iloc[0::2].copy()
sub_2 = data.iloc[1::2].copy()

# One worker thread per GPU: sub_1 is scored by model_0 on device_0, sub_2 by model_1 on device_1.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

# Rows come back as all of sub_1 followed by all of sub_2; `proba` keeps that row order.
result_df = pd.concat(list(results), axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")


elapsed time: 4.7152769565582275


In [15]:

# Test-time augmentation: score the swapped-response inputs and average them
# with the predictions of the original order.
st = time.time()

if cfg.tta:
    # Use a dedicated name instead of rebinding `data`, so re-running the
    # previous cell still operates on the original-order inputs.
    tta_data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
    sub_1 = tta_data.iloc[0::2].copy()
    sub_2 = tta_data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # The swapped inputs can tokenize to different lengths than the originals,
    # so their length-sorted row order is not guaranteed to match result_df.
    # Re-align by index label before averaging; otherwise predictions of
    # different samples could be mixed together.
    tta_result_df = tta_result_df.loc[result_df.index]
    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

print(f"elapsed time: {time.time() - st}")


elapsed time: 3.6905341148376465


# Load model + Inferencing (Ensembled Folds)

In [16]:
'''
from transformers import AutoTokenizer
from peft import PeftModel
from concurrent.futures import ThreadPoolExecutor
import torch
import pandas as pd
import numpy as np
import time

# ---------------- CONFIG ----------------
FOLD_FOLDERS = [
    "/kaggle/input/lora-fold0-4000",
    "/kaggle/input/lora-fold1-4000",
    "/kaggle/input/lora-fold2-4000"
]

BASE_MODEL = "/kaggle/input/gemma-2-v21/transformers/default/1"
TTA_ENABLED = True
BATCH_SIZE = 4

DEVICE_0 = torch.device("cuda:0")
DEVICE_1 = torch.device("cuda:1")

# ---------------- HELPER FUNCTIONS ----------------
def inference(df, model, device, tokenizer, batch_size=BATCH_SIZE):
    model.eval()
    a_win, b_win, tie = [], [], []

    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        tmp = df.iloc[start_idx:end_idx]
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        inputs = tokenizer.pad(
            {"input_ids": input_ids, "attention_mask": attention_mask},
            padding="longest",
            return_tensors="pt",
        ).to(device)

        with torch.no_grad(), torch.cuda.amp.autocast():
            outputs = model(**inputs)
            proba = outputs.logits.softmax(-1).cpu()

        a_win.extend(proba[:,0].tolist())
        b_win.extend(proba[:,1].tolist())
        tie.extend(proba[:,2].tolist())

    df_out = df.copy()
    df_out["winner_model_a"] = a_win
    df_out["winner_model_b"] = b_win
    df_out["winner_tie"] = tie
    return df_out

def symmetric_average(df):
    df_copy = df.copy()
    df_flipped = df_copy[["winner_model_b", "winner_model_a", "winner_tie"]].copy()
    df_avg = (df_copy[["winner_model_a", "winner_model_b", "winner_tie"]] + df_flipped) / 2
    df_copy[["winner_model_a", "winner_model_b", "winner_tie"]] = df_avg
    return df_copy

def generate_tta_variants(df):
    # Placeholder: just returns original for now
    return [df.copy()]  

# ---------------- MAIN LOOP ----------------
fold_preds = []

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

for fold_idx, fold_folder in enumerate(FOLD_FOLDERS):
    print(f"\n=== FOLD {fold_idx} ===")
    
    # load full base + LoRA per fold
    model_0 = Gemma2ForSequenceClassification.from_pretrained(BASE_MODEL)
    model_1 = Gemma2ForSequenceClassification.from_pretrained(BASE_MODEL)
    model_0 = PeftModel.from_pretrained(model_0, fold_folder)
    model_1 = PeftModel.from_pretrained(model_1, fold_folder)

    # ---- NORMAL INFERENCE ----
    test["length"] = test["input_ids"].apply(len)
    data = test.sort_values("length", ascending=False)
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    st = time.time()
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (tokenizer, tokenizer))
    result_df = pd.concat(list(results), axis=0)
    proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values
    print(f"Normal inference done in {time.time() - st:.1f}s")

    # ---- TTA INFERENCE ----
    if TTA_ENABLED:
        tta_variants = generate_tta_variants(test)
        tta_probs_list = []
        for df_aug in tta_variants:
            df_out = inference(df_aug, model_0, DEVICE_0, tokenizer)  # single model for simplicity
            # Symmetrize
            df_out = symmetric_average(df_out)
            tta_probs_list.append(df_out[["winner_model_a","winner_model_b","winner_tie"]].values)
        # average across TTA variants
        proba = np.mean(tta_probs_list, axis=0)

    fold_df = pd.DataFrame({
        "id": np.arange(proba.shape[0]),
        "winner_model_a": proba[:,0],
        "winner_model_b": proba[:,1],
        "winner_tie": proba[:,2]
    })
    
    fold_preds.append(fold_df)

# ---- ENSEMBLE ACROSS FOLDS ----
ensemble_df = pd.concat(fold_preds).groupby("id").mean().reset_index()
ensemble_df.to_csv("submission.csv", index=False)
display(ensemble_df)
'''

'\nfrom transformers import AutoTokenizer\nfrom peft import PeftModel\nfrom concurrent.futures import ThreadPoolExecutor\nimport torch\nimport pandas as pd\nimport numpy as np\nimport time\n\n# ---------------- CONFIG ----------------\nFOLD_FOLDERS = [\n    "/kaggle/input/lora-fold0-4000",\n    "/kaggle/input/lora-fold1-4000",\n    "/kaggle/input/lora-fold2-4000"\n]\n\nBASE_MODEL = "/kaggle/input/gemma-2-v21/transformers/default/1"\nTTA_ENABLED = True\nBATCH_SIZE = 4\n\nDEVICE_0 = torch.device("cuda:0")\nDEVICE_1 = torch.device("cuda:1")\n\n# ---------------- HELPER FUNCTIONS ----------------\ndef inference(df, model, device, tokenizer, batch_size=BATCH_SIZE):\n    model.eval()\n    a_win, b_win, tie = [], [], []\n\n    for start_idx in range(0, len(df), batch_size):\n        end_idx = min(start_idx + batch_size, len(df))\n        tmp = df.iloc[start_idx:end_idx]\n        input_ids = tmp["input_ids"].to_list()\n        attention_mask = tmp["attention_mask"].to_list()\n        inputs = 

# Self-ensemble

In [17]:
'''
def self_ensemble(probas, group_size=2, n_rounds=3):
    n = len(probas)
    all_outputs = np.zeros_like(probas)
    for _ in range(n_rounds):
        indices = np.random.permutation(n)
        groups = [indices[i:i+group_size] for i in range(0, n, group_size)]
        smoothed = np.zeros_like(probas)
        for g in groups:
            group_mean = probas[g].mean(axis=0)
            smoothed[g] = 0.5 * probas[g] + 0.5 * group_mean  # blend with group mean
        all_outputs += smoothed
    return all_outputs / n_rounds
'''
# proba = self_ensemble(proba, group_size=4, n_rounds=5)

# --- Self-Ensemble Smoothing (per-sample calibration) ---
'''
import numpy as np

def self_ensemble_per_sample(probas: np.ndarray, group_size: int = 8) -> np.ndarray:
    """
    Smooths logits/probabilities by blending each sample with a small local neighborhood average.
    Inspired by 'Self-Ensemble Calibration' (arXiv:2506.01951).
    """
    n = len(probas)
    smoothed = probas.copy().astype(np.float32)
    for i in range(n):
        start = max(0, i - group_size // 2)
        end = min(n, i + group_size // 2)
        local_mean = probas[start:end].mean(axis=0)
        smoothed[i] = 0.7 * probas[i] + 0.3 * local_mean
    return smoothed / smoothed.sum(axis=1, keepdims=True)

# Apply smoothing
# proba = self_ensemble_per_sample(proba)
'''

'\nimport numpy as np\n\ndef self_ensemble_per_sample(probas: np.ndarray, group_size: int = 8) -> np.ndarray:\n    """\n    Smooths logits/probabilities by blending each sample with a small local neighborhood average.\n    Inspired by \'Self-Ensemble Calibration\' (arXiv:2506.01951).\n    """\n    n = len(probas)\n    smoothed = probas.copy().astype(np.float32)\n    for i in range(n):\n        start = max(0, i - group_size // 2)\n        end = min(n, i + group_size // 2)\n        local_mean = probas[start:end].mean(axis=0)\n        smoothed[i] = 0.7 * probas[i] + 0.3 * local_mean\n    return smoothed / smoothed.sum(axis=1, keepdims=True)\n\n# Apply smoothing\n# proba = self_ensemble_per_sample(proba)\n'

# Math Processing (NLP Research)

In [18]:


import math

# ---- parameters (tuned values; the tuning itself is not part of the visible notebook) ----
# They are passed to probs_from_scores() below.
alpha = 19.831675854828404  # exponent applied to each score (s ** alpha)
gamma = 49.50181595059537   # extra weight given to B, proportional to B's lead over A
eta   = 13.438806629499513  # decay rate of the tie weight as B's lead grows
eps0  = 0.2512371308916018  # tie weight factor when B does not lead
beta  = 0.0   # optional suppression of A when B leads

def probs_from_scores(sA, sB, alpha, gamma, eta, eps0, beta=0.0):
    """Turn two scalar scores into (P(A wins), P(B wins), P(tie)).

    Only a positive lead of B over A (``max(0, sB - sA)``) modifies the
    weights: it scales A down via ``beta``, scales B up via ``gamma`` and
    shrinks the tie share exponentially via ``eta``.

    Args:
        sA, sB: scores of model A and model B.
        alpha: exponent applied to both scores.
        gamma: boost factor for B per unit of B's lead.
        eta: exponential decay rate of the tie share per unit of B's lead.
        eps0: tie share (relative to A + B weight) when B does not lead.
        beta: suppression factor for A per unit of B's lead.

    Returns:
        A 3-tuple of the normalised weights, summing to 1.
    """
    lead_b = max(0.0, sB - sA)

    weight_a = (sA ** alpha) * (1 - beta * lead_b)
    weight_b = (sB ** alpha) * (1 + gamma * lead_b)

    # The tie weight is a fraction of the combined A + B weight.
    tie_share = eps0 * math.exp(-eta * lead_b)
    weight_tie = tie_share * (weight_a + weight_b)

    total = weight_a + weight_b + weight_tie
    return tuple(weight / total for weight in (weight_a, weight_b, weight_tie))

# Per-case overrides: case id -> (model_a score, model_b score) from the NLP research task.
# Original note: case 1 ignored. Every entry is currently commented out, so the
# loop below does nothing.
rows = {
    #0: (0.640567, 0.860696),
    # 1: (0.7023, 0.6961),
    # 1: (0.6026, 0.5994),
    # 1: (0.5911, 0.5819),
    #1: (0.5979, 0.6212),
    #2: (0.748224, 0.760298),
}

# NOTE(review): the target row is a position inside `proba`, not a test id -
# confirm it still matches the row order of `proba` before enabling an entry above.
for cid, (sA, sB) in rows.items():
    pA, pB, pT = probs_from_scores(sA, sB, alpha, gamma, eta, eps0, beta)

    # case 0 -> row 1, case 2 -> row 0, any other case -> row 2
    r = 1 if cid == 0 else (0 if cid == 2 else 2)

    proba[r, 0], proba[r, 1], proba[r, 2] = pA, pB, pT

    print(f"case {cid}: pA={pA:.12f}, pB={pB:.12f}, pT={pT:.12f}")

In [19]:

# Copy the final probabilities back onto result_df; `proba` was taken from
# result_df, so its rows are in result_df's row order.
result_df.loc[:, "winner_model_a"] = proba[:, 0]
result_df.loc[:, "winner_model_b"] = proba[:, 1]
result_df.loc[:, "winner_tie"] = proba[:, 2]
# Keep only the id and the three probability columns for the submission file.
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
2,1233961,0.213238,0.474788,0.311974
0,136060,0.022865,0.903275,0.07386
1,211333,0.3986,0.208357,0.393043
