# Credits go to @tascj0: this is just a copy of [his team's best solution](https://www.kaggle.com/competitions/lmsys-chatbot-arena/discussion/527685)

# Packages

In [None]:
%pip install /kaggle/input/lmsys-packages/triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
%pip install /kaggle/input/lmsys-packages/xformers-0.0.24042abc8.d20240802-cp310-cp310-linux_x86_64.whl

In [None]:
!cp -r /kaggle/input/lmsys-modules-0805 human_pref

# Prepare test file

In [None]:
%%writefile prepare_test_file.py
import pandas as pd


# Read the competition test set and attach constant placeholder labels
# (presumably only so the downstream dataset class finds label columns).
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")
test_df = test_df.assign(winner_model_a=1, winner_model_b=0, winner_tie=0)
test_df.to_parquet("test.parquet", index=False)

# Second copy with the two responses exchanged (column positions unchanged).
swapped_df = test_df.copy()
swapped_df["response_a"] = test_df["response_b"]
swapped_df["response_b"] = test_df["response_a"]
swapped_df.to_parquet("test_swap.parquet", index=False)

In [None]:
!python prepare_test_file.py

# Inference: gemma2-9b

In [None]:
%%writefile predict_m0.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

# from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_gemma2 import Gemma2ForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device


# Checkpoint directory (the tokenizer is loaded from it too) and the input table.
model_name_or_path = "/kaggle/input/lmsys-checkpoints-0-0805"
csv_path = "test.parquet"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Prompt/response formatter; this script disables the system role.
processor = ProcessorPAB(
    tokenizer=tokenizer, max_length=4096, support_system_role=False
)

# Parquet-backed dataset over the unswapped test file.
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)

# Each DataLoader item is iterated as a sequence of micro-batches by the
# inference loop; max_tokens=8192 is passed to the sharding collator.
collator = ShardedMaxTokensCollator(max_tokens=8192, base_collator=VarlenCollator())
dataloader = DataLoader(dataset, batch_size=80, num_workers=4, collate_fn=collator)

# Two-stage placement: embeddings + first half of the decoder layers on cuda:0,
# second half + final norm + classification head on cuda:1.
num_hidden_layers = 42
first_stage_layers = num_hidden_layers // 2
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
device_map.update(
    {
        f"model.layers.{layer_idx}": (
            "cuda:0" if layer_idx < first_stage_layers else "cuda:1"
        )
        for layer_idx in range(num_hidden_layers)
    }
)

model = Gemma2ForSequenceClassification.from_pretrained(
    model_name_or_path, torch_dtype=torch.float16, device_map=device_map
)

# Rotary inverse frequencies, computed once on CPU and copied to each GPU so
# both pipeline stages have a local tensor.
config = model.config
dim = config.head_dim
rotary_exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
inv_freq = 1.0 / (config.rope_theta ** rotary_exponents)
inv_freq0, inv_freq1 = (inv_freq.to(gpu) for gpu in ("cuda:0", "cuda:1"))


# for name, p in model.named_parameters():
#     print(name, p.device)
# for name, b in model.model.named_buffers():
#     print(name, b.device)

# pipeline parallelism with two GPUs
#
# Software-pipelined loop: on every step the second stage (forward_part2, fed
# with tensors moved to cuda:1) consumes the hidden states of the PREVIOUS
# micro-batch while the first stage (forward_part1, fed with tensors on cuda:0)
# processes the CURRENT one. Logits therefore lag inputs by one micro-batch and
# the final micro-batch is flushed after the loop.
# NOTE(review): if the dataloader yields no micro-batch at all,
# prev_hidden_states is never assigned and the flush below raises NameError.
is_first = True
hidden_states = None
outs = []  # per-micro-batch logits on CPU, appended in input order
for batch in tqdm(dataloader):
    # each dataloader item is iterated as a sequence of micro-batches
    for micro_batch in batch:
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")
        # sequence metadata for the packed micro-batch, passed to both stages
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
            # attn_bias is not built in this script (left disabled):
            # attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
        )
        seq_info = to_device(seq_info, "cuda:0")
        if is_first:
            # Prime the pipeline: only stage 1 runs for the very first micro-batch.
            with torch.no_grad(), torch.cuda.amp.autocast():
                prev_hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
            is_first = False
            # hand the stage-1 output and its metadata over to the second GPU
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, prev_hidden_states], "cuda:1"
            )
            continue
        with torch.no_grad(), torch.cuda.amp.autocast():
            # stage 2 on the previous micro-batch, then stage 1 on the current one;
            # presumably the two launches overlap across GPUs — TODO confirm
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

            # current micro-batch becomes "previous" for the next iteration
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            # copy to host only after both stages have been issued
            outs.append(logits.cpu())

# last micro-batch
# Flush the pipeline: the final micro-batch has only been through stage 1 so far.
with torch.no_grad(), torch.cuda.amp.autocast():
    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())

# Concatenate the per-micro-batch logits and turn them into probabilities over
# the last dimension.
pred = torch.cat(outs, dim=0)
prob = pred.softmax(-1)
# The labels in test.parquet are constant placeholders written by
# prepare_test_file.py, so the printed value is presumably not a meaningful score.
print(dataset.evaluate(prob.numpy()))

# NOTE(review): `prob` is still a torch tensor here; np.save presumably converts
# it through the array protocol — `prob.numpy()` would be more explicit.
np.save('prob_m0.npy', prob)

In [None]:
!python predict_m0.py

# Inference: llama3-8b

In [None]:
%%writefile predict_m3.py
import torch
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer

from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
from human_pref.models.modeling_llama import LlamaForSequenceClassification
from human_pref.data.processors import ProcessorPAB
from human_pref.data.dataset import LMSYSDataset
from human_pref.data.collators import VarlenCollator, ShardedMaxTokensCollator
from human_pref.utils import to_device


# Checkpoint directory (the tokenizer is loaded from it too) and the input
# table; this script reads the copy with response_a / response_b exchanged.
model_name_or_path = "/kaggle/input/lmsys-checkpoints-3-0805"
csv_path = "test_swap.parquet"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Mark the over-length warning as already emitted so the tokenizer stays quiet.
tokenizer.deprecation_warnings.update(
    {"sequence-length-is-longer-than-the-specified-maximum": True}
)

# Prompt/response formatter; this script enables the system role.
processor = ProcessorPAB(
    tokenizer=tokenizer, max_length=4096, support_system_role=True
)

# Parquet-backed dataset over the swapped test file.
dataset = LMSYSDataset(
    csv_file=csv_path,
    query=None,
    processor=processor,
    include_swap=False,
    is_parquet=True,
)

# Each DataLoader item is iterated as a sequence of micro-batches by the
# inference loop; max_tokens=8192 is passed to the sharding collator.
collator = ShardedMaxTokensCollator(max_tokens=8192, base_collator=VarlenCollator())
dataloader = DataLoader(dataset, batch_size=80, num_workers=4, collate_fn=collator)

# Two-stage placement: embeddings + first half of the decoder layers on cuda:0,
# second half + final norm + classification head on cuda:1.
num_hidden_layers = 32
first_stage_layers = num_hidden_layers // 2
device_map = {
    "model.embed_tokens": "cuda:0",
    "model.norm": "cuda:1",
    "score": "cuda:1",
}
device_map.update(
    {
        f"model.layers.{layer_idx}": (
            "cuda:0" if layer_idx < first_stage_layers else "cuda:1"
        )
        for layer_idx in range(num_hidden_layers)
    }
)

model = LlamaForSequenceClassification.from_pretrained(
    model_name_or_path, torch_dtype=torch.float16, device_map=device_map
)

# Rotary inverse frequencies, computed once on CPU and copied to each GPU so
# both pipeline stages have a local tensor. The per-head dimension is derived
# from hidden_size / num_attention_heads.
config = model.config
dim = config.hidden_size // config.num_attention_heads
rotary_exponents = torch.arange(0, dim, 2, dtype=torch.float32) / dim
inv_freq = 1.0 / (config.rope_theta ** rotary_exponents)
inv_freq0, inv_freq1 = (inv_freq.to(gpu) for gpu in ("cuda:0", "cuda:1"))


# for name, p in model.named_parameters():
#     print(name, p.device)
# for name, b in model.model.named_buffers():
#     print(name, b.device)

# pipeline parallelism with two GPUs
#
# Software-pipelined loop: on every step the second stage (forward_part2, fed
# with tensors moved to cuda:1) consumes the hidden states of the PREVIOUS
# micro-batch while the first stage (forward_part1, fed with tensors on cuda:0)
# processes the CURRENT one. Logits therefore lag inputs by one micro-batch and
# the final micro-batch is flushed after the loop.
# NOTE(review): if the dataloader yields no micro-batch at all,
# prev_hidden_states is never assigned and the flush below raises NameError.
is_first = True
hidden_states = None
outs = []  # per-micro-batch logits on CPU, appended in input order
for batch in tqdm(dataloader):
    # each dataloader item is iterated as a sequence of micro-batches
    for micro_batch in batch:
        input_ids = to_device(micro_batch["input_ids"], "cuda:0")
        # sequence metadata for the packed micro-batch, passed to both stages;
        # the xformers attention bias is built from the per-sequence lengths
        seq_info = dict(
            cu_seqlens=micro_batch["cu_seqlens"],
            position_ids=micro_batch["position_ids"],
            max_seq_len=micro_batch["max_seq_len"],
            attn_bias=BlockDiagonalCausalMask.from_seqlens(micro_batch["seq_lens"]),
        )
        seq_info = to_device(seq_info, "cuda:0")
        if is_first:
            # Prime the pipeline: only stage 1 runs for the very first micro-batch.
            with torch.no_grad(), torch.cuda.amp.autocast():
                prev_hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)
            is_first = False
            # hand the stage-1 output and its metadata over to the second GPU
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, prev_hidden_states], "cuda:1"
            )
            continue
        with torch.no_grad(), torch.cuda.amp.autocast():
            # stage 2 on the previous micro-batch, then stage 1 on the current one;
            # presumably the two launches overlap across GPUs — TODO confirm
            logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
            hidden_states = model.forward_part1(input_ids, seq_info, inv_freq0)

            # current micro-batch becomes "previous" for the next iteration
            prev_seq_info, prev_hidden_states = to_device(
                [seq_info, hidden_states], "cuda:1"
            )
            # copy to host only after both stages have been issued
            outs.append(logits.cpu())

# last micro-batch
# Flush the pipeline: the final micro-batch has only been through stage 1 so far.
with torch.no_grad(), torch.cuda.amp.autocast():
    logits = model.forward_part2(prev_hidden_states, prev_seq_info, inv_freq1)
    outs.append(logits.cpu())


# Concatenate the per-micro-batch logits and turn them into probabilities over
# the last dimension. Inputs came from test_swap.parquet, so the saved columns
# refer to the swapped response order.
pred = torch.cat(outs, dim=0)
prob = pred.softmax(-1)
# The labels in test_swap.parquet are constant placeholders written by
# prepare_test_file.py, so the printed value is presumably not a meaningful score.
print(dataset.evaluate(prob.numpy()))

# NOTE(review): `prob` is still a torch tensor here; np.save presumably converts
# it through the array protocol — `prob.numpy()` would be more explicit.
np.save('prob_m3.npy', prob)

In [None]:
!python predict_m3.py

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
from transformers import set_seed
import ctypes, gc
import torch
import random
import numpy as np

# Handle to the C library by its glibc soname, used by clear_memory() to call
# malloc_trim.
libc = ctypes.CDLL("libc.so.6")
def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch, then calls
    ``transformers.set_seed`` with the same value, and configures cuDNN for
    reproducible execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick possibly different
    # algorithms from run to run, which defeats `deterministic = True`; it has
    # to be disabled for reproducible results.
    torch.backends.cudnn.benchmark = False
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    set_seed(seed)
    
def clear_memory():
    """Release as much host and GPU memory as possible.

    Runs the Python garbage collector first so that unreachable objects
    (including tensors kept alive only by reference cycles) are actually
    freed, then asks PyTorch to drop its cached CUDA blocks, and finally asks
    glibc to return freed heap pages to the operating system.
    """
    # Collect first: the two calls below can only give back memory that has
    # already been freed.
    gc.collect()
    torch.cuda.empty_cache()
    libc.malloc_trim(0)

# One global seed, applied immediately.
SEED = 42
seed_everything(seed=SEED)
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import log_loss
import joblib

In [None]:
# Load the three competition tables in one pass (train, test, submission template).
train, test, sample_submission = (
    pd.read_csv(csv_file)
    for csv_file in (
        r'/kaggle/input/llm-classification-finetuning/train.csv',
        r'/kaggle/input/llm-classification-finetuning/test.csv',
        r'/kaggle/input/llm-classification-finetuning/sample_submission.csv',
    )
)

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack
from lightgbm import LGBMClassifier
import gc

# Integer class target: 0 = model A wins, 1 = model B wins, 2 = tie.
label_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
label_to_class = {column: class_id for class_id, column in enumerate(label_columns)}
y = train[label_columns].idxmax(axis=1).map(label_to_class)

# Corpus the vocabularies are learned from: the test set when it has more than
# three rows, otherwise the training set.
# NOTE(review): the models loaded later were presumably trained with some fixed
# vocabulary; re-fitting on different data may change feature columns — confirm.
df_fit = train if len(test) <= 3 else test

# Options shared by every word-level / character-level vectorizer.
word_opts = dict(stop_words='english', min_df=0.002, ngram_range=(1, 3))
char_opts = dict(analyzer='char', ngram_range=(1, 3), min_df=0.002)

# Word n-grams: 500 features for prompts, 1000 for responses.
tfidf_prompt = TfidfVectorizer(max_features=500, **word_opts)
count_prompt = CountVectorizer(max_features=500, **word_opts)
tfidf_response = TfidfVectorizer(max_features=1000, **word_opts)
count_response = CountVectorizer(max_features=1000, **word_opts)

# Character n-grams with the same feature budgets.
tfidf_prompt_char = TfidfVectorizer(max_features=500, **char_opts)
count_prompt_char = CountVectorizer(max_features=500, **char_opts)
tfidf_response_char = TfidfVectorizer(max_features=1000, **char_opts)
count_response_char = CountVectorizer(max_features=1000, **char_opts)

# Prompt vectorizers see the prompts; response vectorizers see responses A and
# B stacked into one corpus so both sides share a vocabulary.
prompt_corpus = df_fit['prompt']
for prompt_vectorizer in (
    tfidf_prompt,
    count_prompt,
    tfidf_prompt_char,
    count_prompt_char,
):
    prompt_vectorizer.fit(prompt_corpus)

for response_vectorizer in (
    tfidf_response,
    count_response,
    tfidf_response_char,
    count_response_char,
):
    response_vectorizer.fit(pd.concat([df_fit['response_a'], df_fit['response_b']]))

def get_features(df):
    """Build the sparse feature matrix for a frame of prompt/response pairs.

    Column layout of the result, left to right:
      1. response A features, then response B features (each: word TF-IDF,
         word counts, char TF-IDF, char counts),
      2. dense surface statistics (square root and log1p of each raw count),
      3. prompt features (word TF-IDF, word counts, char TF-IDF, char counts).

    Args:
        df: DataFrame with 'prompt', 'response_a' and 'response_b' text columns.

    Returns:
        A scipy CSR matrix with one row per row of ``df``.
    """
    n_rows = len(df)

    # Prompt side: the four module-level prompt vectorizers, in fixed order.
    prompt_vectorizers = (
        tfidf_prompt,
        count_prompt,
        tfidf_prompt_char,
        count_prompt_char,
    )
    prompt_block = hstack([vec.transform(df['prompt']) for vec in prompt_vectorizers])

    # Response side: transform A and B in one pass (A rows first), then split
    # the rows back into the two sides.
    stacked_responses = pd.concat([df['response_a'], df['response_b']], axis=0)
    response_vectorizers = (
        tfidf_response,
        count_response,
        tfidf_response_char,
        count_response_char,
    )
    stacked_mats = [vec.transform(stacked_responses) for vec in response_vectorizers]
    side_a = hstack([mat[:n_rows] for mat in stacked_mats])
    side_b = hstack([mat[n_rows:] for mat in stacked_mats])
    response_block = hstack([side_a, side_b])

    # Surface statistics: occurrences of each pattern in each text column.
    # NOTE(review): Series.str.count treats the pattern as a regular expression,
    # so '.' counts every non-newline character rather than literal periods.
    # Left unchanged because the pre-trained models loaded elsewhere presumably
    # expect exactly these features — confirm before altering.
    raw_columns = [
        df[column].str.count(pattern).values
        for pattern in ['\n', '\n\n', '.', ' ', '","']
        for column in ['prompt', 'response_a', 'response_b']
    ]
    raw_columns.append(df['prompt'].str.len().values)
    raw_columns.append(df['prompt'].str.split().apply(len).values)

    raw_stats = np.stack(raw_columns, axis=1)
    # Two compressed views of every raw count.
    dense_block = np.hstack([raw_stats ** 0.5, np.log1p(raw_stats)])

    return hstack([response_block, dense_block, prompt_block]).tocsr()

In [None]:
# Featurize both frames, train first and test second.
# NOTE(review): X_train is not referenced by any later visible cell — confirm
# whether the (expensive) train featurization is still needed.
X_train, X_test = (get_features(frame) for frame in (train, test))

In [None]:
def load_models_and_weights(save_path='/kaggle/input/model-and-weights-more-param/models_and_weights_more_param.joblib'):
    """Load the saved ensemble members and their blending weights.

    Args:
        save_path: Path to a joblib file holding a mapping with the keys
            'models' and 'weights'.

    Returns:
        A ``(models, weights)`` tuple taken from that mapping.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only point this at
    # trusted files.
    bundle = joblib.load(save_path)
    models, weights = bundle['models'], bundle['weights']
    print(f'Loaded {len(models)} models with corresponding weights.')
    return models, weights

def weighted_predict(models, weights, X_test):
    """Blend class probabilities from several models with a weighted mean.

    Args:
        models: Iterable of fitted estimators exposing ``predict_proba``.
        weights: One blending weight per model, in the same order.
        X_test: Feature matrix; ``X_test.shape[0]`` is the number of rows.

    Returns:
        Array of shape (n_rows, 3): the weight-scaled sum of each model's
        probabilities divided by the sum of all weights.
    """
    blended = np.zeros((X_test.shape[0], 3))
    for estimator, model_weight in zip(models, weights):
        blended += estimator.predict_proba(X_test) * model_weight
    return blended / np.sum(weights)

# Restore the saved ensemble and score the test features with it.
models, weights = load_models_and_weights(
    save_path='/kaggle/input/model-and-weights-more-param/models_and_weights_more_param.joblib'
)
y_test_proba = weighted_predict(models=models, weights=weights, X_test=X_test)

# Make submission

In [None]:
import numpy as np
import pandas as pd

# Only the ids are needed from the parquet copy of the test set.
df = pd.read_parquet("test.parquet")

# Ensemble members, in the same order as the blend weights below.
# prob_m3 was produced from test_swap.parquet (responses exchanged), so its
# first two columns are swapped back before blending.
member_probs = np.stack(
    [
        np.load("prob_m0.npy"),
        np.load("prob_m3.npy")[:, [1, 0, 2]],
        y_test_proba,
    ]
)
preds = np.average(member_probs, axis=0, weights=[50, 42, 0.5])

# Submission layout: id followed by the three class probabilities.
class_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
sub = pd.DataFrame(
    {
        "id": df["id"],
        **{column: preds[:, k] for k, column in enumerate(class_columns)},
    }
)
sub.to_csv("submission.csv", index=False)
print(sub.head())