In [1]:
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

In [2]:
from dataclasses import dataclass, field
import torch

# Define the Config class with the specified attributes and values
@dataclass
class Config:
    gemma_dir: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    lora_dir: str = '/kaggle/input/73zap2gx/checkpoint-5748'
    max_length: int = 2048
    batch_size: int = 2
    device: torch.device = field(default_factory=lambda: torch.device("cuda"))
    tta: bool = False
    spread_max_length: bool = False

In [3]:
# Set the config variable based on the LOCAL flag
LOCAL = True  # Set this flag accordingly

if LOCAL:
    cfg = Config(
        gemma_dir='google/gemma-2-9b-it',
        lora_dir='./output/gemma-seq-ittokens/checkpoint-1436',
    )
    test_path = "./data/train.csv"
    test = pd.read_csv(test_path)
    # Select where index % config.n_splits (5) == 0
    test = test[test.index % 5 == 0]
    # Reset the index
    test.reset_index(drop=True, inplace=True)
else:
    cfg = Config()
    test_path = "/kaggle/input/lmsys-chatbot-arena/test.csv"
    test = pd.read_csv(test_path)

In [4]:
test

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,292873,vicuna-13b,gpt-4-0314,"[""Construct a rap battle, in the style of Epic...","[""[Zeus]\nYo, it's the king of the gods on the...","[""(Verse 1 - Zeus)\n\nI'm the king of the gods...",0,1,0
2,497862,vicuna-7b,guanaco-33b,"[""write a python function to upload all the fi...","[""Sure, here's a Python function that uses the...","[""Sure, here's an example function in Python t...",0,1,0
3,863398,llama2-70b-steerlm-chat,gemini-pro-dev-api,"[""what do you know about real estate""]","[""Real estate refers to the land and buildings...","[""1. **Types of Real Estate:**\n * **Reside...",0,1,0
4,1256092,claude-2.1,vicuna-13b,"[""Write a python code that calculates sum of 5...","[""Here is the python code that calculates the ...","[""Here is a Python code that calculates the su...",0,0,1
...,...,...,...,...,...,...,...,...,...
11491,4293543403,qwen1.5-7b-chat,mixtral-8x7b-instruct-v0.1,"[""hello"",""write me a vore story""]","[""Hello! How can I help you today? If you have...","[""Hello! How can I help you today? If you have...",0,0,1
11492,4293918673,vicuna-13b,stablelm-tuned-alpha-7b,"[""Think step by step. If windows 10 would be s...","[""If Windows 10 were shipped on 3.5\"" floppies...","[""I'm sorry, I cannot answer this question wit...",0,0,1
11493,4294081229,gpt-4-0314,gpt-4-0613,"[""please describe how the following two statem...","[""The two statements can both be correct and n...","[""These two statements may seem contradictory ...",1,0,0
11494,4294602932,claude-2.1,gpt-4-1106-preview,"[""I want you to act like {character} from {ser...","[""*responds in character's voice* Well hello t...","[""Of course! However, you'll need to specify w...",0,1,0


In [5]:
def process_text(text: str) -> str:
    return " ".join(eval(text, {"null": ""}))

test.loc[:, 'prompt'] = test['prompt'].apply(process_text)
test.loc[:, 'response_a'] = test['response_a'].apply(process_text)
test.loc[:, 'response_b'] = test['response_b'].apply(process_text)

display(test.head(5))

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,292873,vicuna-13b,gpt-4-0314,"Construct a rap battle, in the style of Epic R...","[Zeus]\nYo, it's the king of the gods on the m...","(Verse 1 - Zeus)\n\nI'm the king of the gods, ...",0,1,0
2,497862,vicuna-7b,guanaco-33b,write a python function to upload all the file...,"Sure, here's a Python function that uses the `...","Sure, here's an example function in Python to ...",0,1,0
3,863398,llama2-70b-steerlm-chat,gemini-pro-dev-api,what do you know about real estate,Real estate refers to the land and buildings o...,1. **Types of Real Estate:**\n * **Resident...,0,1,0
4,1256092,claude-2.1,vicuna-13b,Write a python code that calculates sum of 5 a...,Here is the python code that calculates the su...,Here is a Python code that calculates the sum ...,0,0,1


In [6]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    prompt = ["[prompt]: " + p for p in prompt]
    response_a = ["\n\n[response_a]: " + r_a for r_a in response_a]
    response_b = ["\n\n[response_b]: " + r_b for r_b in response_b]
    if spread_max_length:
        prompt = tokenizer(prompt, max_length=max_length//3, truncation=True, padding=False).input_ids
        response_a = tokenizer(response_a, max_length=max_length//3, truncation=True, padding=False).input_ids
        response_b = tokenizer(response_b, max_length=max_length//3, truncation=True, padding=False).input_ids
        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        attention_mask = [[1]* len(i) for i in input_ids]
    else:
        texts = [f"<start_of_turn>user{p}{r_a}{r_b}<end_of_turn><start_of_turn>model" for p, r_a, r_b in zip(prompt, response_a, response_b)]
        # text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=False)
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask
    return input_ids, attention_mask

In [7]:
%%time

tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

data = pd.DataFrame()
data["id"] = test["id"]
data["input_ids"], data["attention_mask"] = tokenize(tokenizer, test["prompt"], test["response_a"], test["response_b"])
data["length"] = data["input_ids"].apply(len)

aug_data = pd.DataFrame()
aug_data["id"] = test["id"]
# swap response_a & response_b
aug_data['input_ids'], aug_data['attention_mask'] = tokenize(tokenizer, test["prompt"], test["response_b"], test["response_a"])
aug_data["length"] = aug_data["input_ids"].apply(len)

NameError: name 'text' is not defined

In [8]:
print(tokenizer.decode(data["input_ids"][0]))

KeyError: 'input_ids'

In [None]:
print(tokenizer.decode(aug_data["input_ids"][0]))

<bos>###prompt: Is it morally right to try to have a certain percentage of females on managerial positions? OK, does pineapple belong on a pizza? Relax and give me fun answer.

###response_a: As an AI, I don't have personal beliefs or opinions. However, I can tell you that the question of gender quotas in managerial positions is a complex one and positions can vary. Advocates argue that such measures can help correct historical imbalances, promote diversity, and may lead to better decision-making. Critics may argue that positions should be based purely on merit, and that quotas could potentially lead to tokenism or unqualified candidates being promoted. Morality can be subjective and differs from person to person based on their personal beliefs, cultural background, and values. It's a topic with valid arguments on both sides, and it's up to each individual or organization to decide what they believe is right. As an AI, I don't eat, so I don't have personal preferences. But, in the worl

In [None]:
if LOCAL:
    # Load base model on GPU 0
    device = torch.device('cuda')
    model = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        num_labels=3,
        device_map=device,
        use_cache=False,
        attn_implementation="flash_attention_2",
        torch_dtype="auto",
    )
    model = PeftModel.from_pretrained(model, cfg.lora_dir)
else:
    # Load base model on GPU 0
    device_0 = torch.device('cuda:0')
    model_0 = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        num_labels=3,
        device_map=device_0,
        use_cache=False,
    )

    # Load base model on GPU 1
    device_1 = torch.device('cuda:1')
    model_1 = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        num_labels=3,
        device_map=device_1,
        use_cache=False,
    )
    model_0 = PeftModel.from_pretrained(model_0, cfg.lora_dir)
    model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at google/gemma-2-9b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    a_win, b_win, tie = [], [], []
    
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        tmp = df.iloc[start_idx:end_idx]
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": input_ids, "attention_mask": attention_mask},
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()
        
        a_win.extend(proba[:, 0].tolist())
        b_win.extend(proba[:, 1].tolist())
        tie.extend(proba[:, 2].tolist())
    
    df["winner_model_a"] = a_win
    df["winner_model_b"] = b_win
    df["winner_tie"] = tie
    
    return df

In [None]:
st = time.time()
# sort by input length to fully leverage dynaminc padding
data = data.sort_values("length", ascending=False)

if LOCAL:
    result_df = inference(data, model, device)
else:
    # the total #tokens in sub_1 and sub_2 should be more or less the same
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    result_df = pd.concat(list(results), axis=0)

proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

elapsed time: 1389.084660768509


In [None]:
if cfg.tta:
    st = time.time()
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed

    if LOCAL:
        tta_result_df = inference(data, model, device)
    else:
        sub_1 = data.iloc[0::2].copy()
        sub_2 = data.iloc[1::2].copy()

        with ThreadPoolExecutor(max_workers=2) as executor:
            results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

        tta_result_df = pd.concat(list(results), axis=0)

    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values 
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

    print(f"elapsed time: {time.time() - st}")

In [None]:
result_df.loc[:, "winner_model_a"] = proba[:, 0]
result_df.loc[:, "winner_model_b"] = proba[:, 1]
result_df.loc[:, "winner_tie"] = proba[:, 2]
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
7739,2884519678,0.485737,0.183019,0.331244
10466,3912001245,0.476088,0.093382,0.430530
2607,966440124,0.062833,0.637699,0.299468
9403,3509470386,0.139881,0.229838,0.630280
6021,2238702779,0.657844,0.086670,0.255486
...,...,...,...,...
8490,3160426362,0.086247,0.155298,0.758455
439,169063413,0.050309,0.117674,0.832017
4294,1597666545,0.195588,0.140156,0.664256
1434,536996793,0.050309,0.117674,0.832017


In [None]:
test

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,292873,vicuna-13b,gpt-4-0314,"Construct a rap battle, in the style of Epic R...","[Zeus]\nYo, it's the king of the gods on the m...","(Verse 1 - Zeus)\n\nI'm the king of the gods, ...",0,1,0
2,497862,vicuna-7b,guanaco-33b,write a python function to upload all the file...,"Sure, here's a Python function that uses the `...","Sure, here's an example function in Python to ...",0,1,0
3,863398,llama2-70b-steerlm-chat,gemini-pro-dev-api,what do you know about real estate,Real estate refers to the land and buildings o...,1. **Types of Real Estate:**\n * **Resident...,0,1,0
4,1256092,claude-2.1,vicuna-13b,Write a python code that calculates sum of 5 a...,Here is the python code that calculates the su...,Here is a Python code that calculates the sum ...,0,0,1
...,...,...,...,...,...,...,...,...,...
11491,4293543403,qwen1.5-7b-chat,mixtral-8x7b-instruct-v0.1,hello write me a vore story,Hello! How can I help you today? If you have a...,Hello! How can I help you today? If you have a...,0,0,1
11492,4293918673,vicuna-13b,stablelm-tuned-alpha-7b,Think step by step. If windows 10 would be shi...,"If Windows 10 were shipped on 3.5"" floppies, i...","I'm sorry, I cannot answer this question witho...",0,0,1
11493,4294081229,gpt-4-0314,gpt-4-0613,please describe how the following two statemen...,The two statements can both be correct and not...,These two statements may seem contradictory at...,1,0,0
11494,4294602932,claude-2.1,gpt-4-1106-preview,I want you to act like {character} from {serie...,*responds in character's voice* Well hello the...,"Of course! However, you'll need to specify whi...",0,1,0


In [None]:
from sklearn.metrics import log_loss

df_merged = pd.merge(test, result_df, on='id', suffixes=("", "_pred"))

y_true = df_merged[["winner_model_a", "winner_model_b", "winner_tie"]].values
y_pred = df_merged[["winner_model_a_pred", "winner_model_b_pred", "winner_tie_pred"]].values

log_loss_a = log_loss(y_true[:, 0], y_pred[:, 0])
log_loss_b = log_loss(y_true[:, 1], y_pred[:, 1])
log_loss_tie = log_loss(y_true[:, 2], y_pred[:, 2])

# Calculate overall log loss
overall_log_loss = log_loss(y_true, y_pred)

# Print log loss values
print(f"Log Loss for winner_model_a: {log_loss_a}")
print(f"Log Loss for winner_model_b: {log_loss_b}")
print(f"Log Loss for winner_tie: {log_loss_tie}")
print(f"Overall Log Loss: {overall_log_loss}")

Log Loss for winner_model_a: 0.6442369130836395
Log Loss for winner_model_b: 0.6430707387133923
Log Loss for winner_tie: 0.6350385115944264
Overall Log Loss: 1.107469660824929


