In [1]:
import json
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass

from datasets import load_dataset, load_from_disk
import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

In [2]:
from dataclasses import dataclass, field
import torch

# Define the Config class with the specified attributes and values
@dataclass
class Config:
    gemma_dir: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    lora_dir: str = '/kaggle/input/73zap2gx/checkpoint-5748'
    max_length: int = 2048
    batch_size: int = 16
    device: torch.device = field(default_factory=lambda: torch.device("cuda"))
    tta: bool = False
    spread_max_length: bool = True 

In [3]:
# Set the config variable based on the LOCAL flag
LOCAL = True  # Set this flag accordingly

if LOCAL:
    cfg = Config(
        gemma_dir='unsloth/gemma-2-9b-it-bnb-4bit',
        lora_dir='./output/gemma-seq-continue_training-focal-loss/checkpoint-48',
    )
    test_path = "./data/train.csv"
    # test = pd.read_csv(test_path)
    # Load eval_ds to pd
    eval_ds_dir = "./output/gemma-seq-continue_training2/eval_ds"
    eval_ds = load_from_disk(eval_ds_dir)
    test = eval_ds.to_pandas()
    # Use half of the dataframe for testing
    test = test.iloc[:len(test)//2]
    # Select where index % config.n_splits (5) == 0
    # test = test[test.index % 10 == 0]
    # Reset the index
    # test.reset_index(drop=True, inplace=True)
else:
    cfg = Config()
    test_path = "/kaggle/input/lmsys-chatbot-arena/test.csv"
    test = pd.read_csv(test_path)

In [4]:
def process_text(text: str) -> str:
    return " ".join(eval(text, {"null": ""}))

test.loc[:, 'prompt'] = test['prompt'].apply(process_text)
test.loc[:, 'response_a'] = test['response_a'].apply(process_text)
test.loc[:, 'response_b'] = test['response_b'].apply(process_text)

display(test.head(5))

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie,input_ids,attention_mask,labels
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0,"[2, 235322, 39038, 78880, 2125, 665, 89397, 18...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,292873,vicuna-13b,gpt-4-0314,"Construct a rap battle, in the style of Epic R...","[Zeus]\nYo, it's the king of the gods on the m...","(Verse 1 - Zeus)\n\nI'm the king of the gods, ...",0,1,0,"[2, 235322, 39038, 78880, 58863, 476, 13483, 9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,497862,vicuna-7b,guanaco-33b,write a python function to upload all the file...,"Sure, here's a Python function that uses the `...","Sure, here's an example function in Python to ...",0,1,0,"[2, 235322, 39038, 78880, 5598, 476, 17706, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,863398,llama2-70b-steerlm-chat,gemini-pro-dev-api,what do you know about real estate,Real estate refers to the land and buildings o...,1. **Types of Real Estate:**\n * **Resident...,0,1,0,"[2, 235322, 39038, 78880, 1212, 749, 692, 1230...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,1256092,claude-2.1,vicuna-13b,Write a python code that calculates sum of 5 a...,Here is the python code that calculates the su...,Here is a Python code that calculates the sum ...,0,0,1,"[2, 235322, 39038, 78880, 15615, 476, 17706, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [5]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    prompt = ["<prompt>: " + p for p in prompt]
    response_a = ["\n\n<response_a>: " + r_a for r_a in response_a]
    response_b = ["\n\n<response_b>: " + r_b for r_b in response_b]
    if spread_max_length:
        prompt = tokenizer(prompt, max_length=max_length//3, truncation=True, padding=False).input_ids
        response_a = tokenizer(response_a, max_length=max_length//3, truncation=True, padding=False).input_ids
        response_b = tokenizer(response_b, max_length=max_length//3, truncation=True, padding=False).input_ids
        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        attention_mask = [[1]* len(i) for i in input_ids]
    else:
        # texts = [f"<start_of_turn>user{p}{r_a}{r_b}<end_of_turn><start_of_turn>model" for p, r_a, r_b in zip(prompt, response_a, response_b)]
        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=False)
        print(tokenized)
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask
    return input_ids, attention_mask



In [6]:
%%time

tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

data = pd.DataFrame()
data["id"] = test["id"]
data["input_ids"], data["attention_mask"] = tokenize(tokenizer, test["prompt"], test["response_a"], test["response_b"])
data["length"] = data["input_ids"].apply(len)

aug_data = pd.DataFrame()
aug_data["id"] = test["id"]
# swap response_a & response_b
aug_data['input_ids'], aug_data['attention_mask'] = tokenize(tokenizer, test["prompt"], test["response_b"], test["response_a"])
aug_data["length"] = aug_data["input_ids"].apply(len)

CPU times: user 18.9 s, sys: 473 ms, total: 19.3 s
Wall time: 2.68 s


In [7]:
print(tokenizer.decode(data["input_ids"][0]))

<bos><prompt>: Is it morally right to try to have a certain percentage of females on managerial positions? OK, does pineapple belong on a pizza? Relax and give me fun answer.<eos><bos>

<response_a>: The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.

Here are some arguments in favor of and against such policies:

**Arguments in favor:**

1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.

2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.

3. 

In [8]:
print(tokenizer.decode(aug_data["input_ids"][0]))

<bos><prompt>: Is it morally right to try to have a certain percentage of females on managerial positions? OK, does pineapple belong on a pizza? Relax and give me fun answer.<eos><bos>

<response_a>: As an AI, I don't have personal beliefs or opinions. However, I can tell you that the question of gender quotas in managerial positions is a complex one and positions can vary. Advocates argue that such measures can help correct historical imbalances, promote diversity, and may lead to better decision-making. Critics may argue that positions should be based purely on merit, and that quotas could potentially lead to tokenism or unqualified candidates being promoted. Morality can be subjective and differs from person to person based on their personal beliefs, cultural background, and values. It's a topic with valid arguments on both sides, and it's up to each individual or organization to decide what they believe is right. As an AI, I don't eat, so I don't have personal preferences. But, in 

In [9]:
if LOCAL:
    # Load base model on GPU 0
    device = torch.device('cuda')
    model = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        num_labels=3,
        device_map=device,
        attn_implementation="flash_attention_2",
        torch_dtype="auto",
    )
    model = PeftModel.from_pretrained(model, cfg.lora_dir)
else:
    # Load base model on GPU 0
    device_0 = torch.device('cuda:0')
    model_0 = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        num_labels=3,
        device_map=device_0,
    )

    # Load base model on GPU 1
    device_1 = torch.device('cuda:1')
    model_1 = Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        num_labels=3,
        device_map=device_1,
    )
    model_0 = PeftModel.from_pretrained(model_0, cfg.lora_dir)
    model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from tqdm import tqdm

@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    # Preallocate arrays for results
    a_win = torch.zeros(len(df), dtype=torch.float32, device='cpu')
    b_win = torch.zeros(len(df), dtype=torch.float32, device='cpu')
    tie = torch.zeros(len(df), dtype=torch.float32, device='cpu')
    
    model.eval()
    
    # Use tqdm for progress bar
    for start_idx in tqdm(range(0, len(df), batch_size), desc="Inference"):
        end_idx = min(start_idx + batch_size, len(df))
        tmp = df.iloc[start_idx:end_idx]
        
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": input_ids, "attention_mask": attention_mask},
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()
        
        # Assign values to preallocated arrays
        a_win[start_idx:end_idx] = proba[:, 0]
        b_win[start_idx:end_idx] = proba[:, 1]
        tie[start_idx:end_idx] = proba[:, 2]
    
    # Add the results to the dataframe
    df["winner_model_a"] = a_win.numpy()
    df["winner_model_b"] = b_win.numpy()
    df["winner_tie"] = tie.numpy()
    
    return df

In [11]:
st = time.time()
# sort by input length to fully leverage dynaminc padding
data = data.sort_values("length", ascending=False)

if LOCAL:
    result_df = inference(data, model, device)
else:
    # the total #tokens in sub_1 and sub_2 should be more or less the same
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    result_df = pd.concat(list(results), axis=0)

proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

Inference:   0%|                                                                                                                                                                                                                                                      | 0/360 [00:00<?, ?it/s]

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
Inference: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 360/360 [12:24<00:00,  2.07s/it]

elapsed time: 744.2435967922211





In [12]:
if cfg.tta:
    st = time.time()
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed

    if LOCAL:
        tta_result_df = inference(data, model, device)
    else:
        sub_1 = data.iloc[0::2].copy()
        sub_2 = data.iloc[1::2].copy()

        with ThreadPoolExecutor(max_workers=2) as executor:
            results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

        tta_result_df = pd.concat(list(results), axis=0)

    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values 
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

    print(f"elapsed time: {time.time() - st}")

In [13]:
result_df.loc[:, "winner_model_a"] = proba[:, 0]
result_df.loc[:, "winner_model_b"] = proba[:, 1]
result_df.loc[:, "winner_tie"] = proba[:, 2]
submission_df = result_df[["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
5026,1868849650,0.289792,0.378607,0.331600
664,253156481,0.186876,0.534638,0.278486
5382,2004279155,0.346940,0.257761,0.395300
1888,701950675,0.237280,0.353687,0.409033
1895,705199188,0.214369,0.416905,0.368727
...,...,...,...,...
896,338861224,0.004233,0.009247,0.986520
2804,1045443401,0.038212,0.049450,0.912337
1434,536996793,0.003886,0.007490,0.988625
439,169063413,0.003886,0.007490,0.988625


In [14]:
from sklearn.metrics import log_loss

df_merged = pd.merge(test, result_df, on='id', suffixes=("", "_pred"))

y_true = df_merged[["winner_model_a", "winner_model_b", "winner_tie"]].values
y_pred = df_merged[["winner_model_a_pred", "winner_model_b_pred", "winner_tie_pred"]].values

log_loss_a = log_loss(y_true[:, 0], y_pred[:, 0])
log_loss_b = log_loss(y_true[:, 1], y_pred[:, 1])
log_loss_tie = log_loss(y_true[:, 2], y_pred[:, 2])

# Calculate overall log loss
overall_log_loss = log_loss(y_true, y_pred)

# Print log loss values
print(f"Log Loss for winner_model_a: {log_loss_a}")
print(f"Log Loss for winner_model_b: {log_loss_b}")
print(f"Log Loss for winner_tie: {log_loss_tie}")
print(f"Overall Log Loss: {overall_log_loss}")

# save overall log loss to output dir
with open(cfg.lora_dir + "/eval.txt", "w") as f:
    f.write("log loss: " + str(overall_log_loss))

Log Loss for winner_model_a: 0.5268538583566946
Log Loss for winner_model_b: 0.5263115294082981
Log Loss for winner_tie: 0.5650763313380778
Overall Log Loss: 0.9160198263321147


In [15]:
logits_tensor = torch.cat(logits_list, dim=0)
labels = torch.tensor(test[["winner_model_a", "winner_model_b", "winner_tie"]].values, dtype=torch.long, device=device)
print(logits_tensor.shape, labels.shape)

NameError: name 'logits_list' is not defined

In [None]:
class TemperatureScaling(torch.nn.Module):
    def __init__(self):
        super(TemperatureScaling, self).__init__()
        self.temperature = torch.nn.Parameter(torch.ones(1) * 1.5)  # Initial temperature

    def forward(self, logits):
        return logits / self.temperature

: 

In [None]:
# Check the dtype of the logits tensor.
logits_tensor.dtype

torch.float16

: 

In [None]:
labels = labels.type(torch.float16)

: 

In [None]:
from torch.nn import functional as F

def optimize_temperature(logits, labels):
    temperature_model = TemperatureScaling().to(device)
    optimizer = torch.optim.LBFGS([temperature_model.temperature], lr=0.0001, max_iter=10000)

    def loss_fn():
        optimizer.zero_grad()
        loss = F.cross_entropy(temperature_model(logits), labels)
        loss.backward()
        return loss

    optimizer.step(loss_fn)
    return temperature_model.temperature.item()

optimal_temperature = optimize_temperature(logits_tensor.to(device), labels.to(device))
print(f"Optimal Temperature: {optimal_temperature}")

Optimal Temperature: 1.6404436826705933


: 

In [None]:
# Peek at the predicted probability matrix (columns: model A wins, model B wins, tie).
y_pred

array([[0.38301918, 0.05470297, 0.56227791],
       [0.13922425, 0.51501262, 0.34576312],
       [0.34407678, 0.32063654, 0.33528671],
       ...,
       [0.18897924, 0.32525381, 0.48576692],
       [0.1751326 , 0.20940168, 0.61546576],
       [0.15700088, 0.52085418, 0.32214487]])

: 

In [None]:
# Calculate the log loss with the optimal temperature
from sklearn.metrics import log_loss

optimal_temperature = 1.0

y_true = df_merged[["winner_model_a", "winner_model_b", "winner_tie"]].values
y_pred = df_merged[["winner_model_a_pred", "winner_model_b_pred", "winner_tie_pred"]].values

log_loss_a = log_loss(y_true[:, 0], y_pred[:, 0] / optimal_temperature)
log_loss_b = log_loss(y_true[:, 1], y_pred[:, 1] / optimal_temperature)
log_loss_tie = log_loss(y_true[:, 2], y_pred[:, 2] / optimal_temperature)

# Calculate overall log loss
overall_log_loss = log_loss(y_true, (logits_tensor / optimal_temperature).softmax(-1).cpu().numpy())

# Print log loss values
print(f"Log Loss for winner_model_a: {log_loss_a}")
print(f"Log Loss for winner_model_b: {log_loss_b}")
print(f"Log Loss for winner_tie: {log_loss_tie}")
print(f"Overall Log Loss: {overall_log_loss}")

Log Loss for winner_model_a: 0.5351381970630059
Log Loss for winner_model_b: 0.5330849072025453
Log Loss for winner_tie: 0.5664459252950702
Overall Log Loss: 1.3668954295638178


: 

In [None]:
# Shape of the prediction matrix: (number of merged rows, 3 outcome columns).
y_pred.shape

(5748, 3)

: 

In [None]:
# Softmax of the first logits row after dividing by a temperature of 3, moved to the CPU.
(logits_tensor[0] / 3).softmax(-1).cpu()

: 

In [None]:
# Softmax of the first logits row without temperature scaling, moved to the CPU.
(logits_tensor[0]).softmax(-1).cpu()

: 

: 