<a href="https://colab.research.google.com/github/jxm020202/Multilingual-Chatbot-WDSM-CUP/blob/main/Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle before the download cell runs; per the notice
# above, some of the data sources it fetches are private.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
#
# Each call below fetches one data source through kagglehub and stores the
# value it returns in a variable.
# NOTE(review): none of these variables is read again in this notebook; the
# later cells use hard-coded /kaggle/input/... paths instead. Confirm those
# paths exist when running outside Kaggle.

# Competition data (the test parquet file is read from it later).
wsdm_cup_multilingual_chatbot_arena_path = kagglehub.competition_download('wsdm-cup-multilingual-chatbot-arena')
# LoRA adapter checkpoint.
jxm222_checkpoint_8th_path = kagglehub.dataset_download('jxm222/checkpoint-8th')
# Wheel files used by the offline pip install.
jxm222_wheels_path = kagglehub.dataset_download('jxm222/wheels')
# Base Gemma-2 model.
emiz6413_gemma_2_transformers_gemma_2_9b_it_4bit_1_path = kagglehub.model_download('emiz6413/gemma-2/Transformers/gemma-2-9b-it-4bit/1')

print('Data source import complete.')


In [None]:
# Install/upgrade the model dependencies without network access:
# --no-index keeps pip away from PyPI and --find-links points it at the
# local directory of wheel files.
!pip install peft accelerate transformers bitsandbytes\
    -U --no-index --find-links /kaggle/input/wheels

In [None]:
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

In [None]:
# The model-loading cells place one copy of the model on cuda:0 and another
# on cuda:1, so the notebook requires exactly two visible GPUs.
assert torch.cuda.device_count() == 2

## Configurations

In [None]:
@dataclass
class Config:
    """Inference settings shared by the cells of this notebook.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: without annotations the decorator sees no fields at
    all, so ``Config(batch_size=8)`` raises ``TypeError`` and ``repr(cfg)``
    shows nothing. Defaults are unchanged and remain readable as class
    attributes (``Config.batch_size``).
    """

    # Directory of the base Gemma-2 checkpoint.
    gemma_dir: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    # Directory of the LoRA adapter applied on top of the base model.
    lora_dir: str = '/kaggle/input/checkpoint-8th'
    # Token budget for one tokenized example.
    max_length: int = 512 * 3
    # Number of examples per forward pass.
    batch_size: int = 4
    device: torch.device = torch.device("cuda")
    # Whether to also score the inputs with the two responses swapped and
    # average the two sets of probabilities.
    tta: bool = True
    # Whether to apply max_length//3 on each input or max_length on the
    # concatenated input.
    spread_max_length: bool = False


cfg = Config()

# Load & pre-process Data

In [None]:
# Load the competition test set; later cells read its id, prompt,
# response_a and response_b columns.
test = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet')

In [None]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
test

In [None]:
def process_text(text: str) -> str:
    """Clean one text field before tokenization.

    Removes every occurrence of the substring ``"null"`` and strips leading
    and trailing whitespace.

    A missing value (``None``, NaN, or anything else that is not a ``str``)
    is returned as an empty string instead of raising ``AttributeError``, so
    one missing cell in the test set does not abort the whole run.

    NOTE(review): the replacement also removes "null" inside ordinary words
    (e.g. "nullify" -> "ify"). It is kept unchanged here on the assumption
    that the model was trained with the same preprocessing; confirm.
    """
    if not isinstance(text, str):
        return ""
    return text.replace("null", "").strip()


# Run the same cleaning over each of the three text columns, in place.
for column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, column] = test[column].apply(process_text)

# Tokenize

In [None]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Turn (prompt, response_a, response_b) triples into token ids.

    Each part is prefixed with its tag (``<prompt>: ``, ``<response_a>: ``,
    ``<response_b>: ``) before tokenization.

    Args:
        tokenizer: callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask``.
        prompt, response_a, response_b: iterables of strings, consumed in
            lockstep.
        max_length: token budget for one example.
        spread_max_length: if true, every part is tokenized on its own with a
            budget of ``max_length // 3`` and the id lists are concatenated;
            otherwise the concatenated text is tokenized once with the full
            ``max_length`` budget.

    Returns:
        ``(input_ids, attention_mask)``, one entry per example.
    """
    # TODO: change prompt
    tagged_prompt = ["<prompt>: " + text for text in prompt]
    tagged_a = ["\n\n<response_a>: " + text for text in response_a]
    tagged_b = ["\n\n<response_b>: " + text for text in response_b]

    if not spread_max_length:
        # One tokenizer pass over the joined text; truncation hits the tail.
        joined = [p + a + b for p, a, b in zip(tagged_prompt, tagged_a, tagged_b)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    # Give every part an equal share of the budget, then stitch the ids.
    part_budget = max_length // 3
    part_ids = [
        tokenizer(texts, max_length=part_budget, truncation=True, padding=False).input_ids
        for texts in (tagged_prompt, tagged_a, tagged_b)
    ]
    input_ids = [p + a + b for p, a, b in zip(*part_ids)]
    # Nothing is padded here, so every position is attended to.
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [None]:
%%time

# Tokenizer shared by the tokenization below and by the inference cells.
tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"


def _tokenized_frame(first_response, second_response):
    """Build a frame of id, input_ids, attention_mask and length.

    ``first_response`` is placed in the ``<response_a>`` slot and
    ``second_response`` in the ``<response_b>`` slot.
    """
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], first_response, second_response
    )
    frame["length"] = frame["input_ids"].apply(len)
    return frame


# Responses in their original order.
data = _tokenized_frame(test["response_a"], test["response_b"])
# swap response_a & response_b
aug_data = _tokenized_frame(test["response_b"], test["response_a"])

# Load model

In [None]:
def _load_base_model(target_device):
    """Load the base sequence-classification model onto ``target_device``."""
    return Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        device_map=target_device,
        use_cache=False,
    )


# One full copy of the base model per GPU, so that the two halves of the
# data can be scored at the same time.
device_0 = torch.device('cuda:0')
model_0 = _load_base_model(device_0)

device_1 = torch.device('cuda:1')
model_1 = _load_base_model(device_1)

#### Load LoRA adapter

In [None]:

# Wrap each per-GPU base model with the same LoRA adapter.
model_0, model_1 = (
    PeftModel.from_pretrained(base_model, cfg.lora_dir)
    for base_model in (model_0, model_1)
)

# Inference

In [None]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Score every row of ``df`` with ``model`` and return a copy with results.

    Rows are processed in consecutive chunks of ``batch_size``. Each chunk is
    padded to its longest sequence with the module-level ``tokenizer``, moved
    to ``device`` and passed through ``model``; the logits are turned into
    probabilities with a softmax over the last axis.

    Args:
        df: frame with ``input_ids`` and ``attention_mask`` columns.
        model: the model to call.
        device: device the padded batch is moved to.
        batch_size: number of rows per forward pass.
        max_length: accepted for interface compatibility; not used here.

    Returns:
        A copy of ``df`` with ``winner_model_a`` (softmax column 0) and
        ``winner_model_b`` (softmax column 1) added.
    """
    prob_a, prob_b = [], []

    for offset in range(0, len(df), batch_size):
        batch = df.iloc[offset:offset + batch_size]
        features = {
            column: batch[column].to_list()
            for column in ("input_ids", "attention_mask")
        }
        # Pad only up to the longest sequence of this particular batch.
        padded = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            return_tensors="pt",
        )
        logits = model(**padded.to(device)).logits
        probabilities = logits.softmax(-1).cpu()

        prob_a += probabilities[:, 0].tolist()
        prob_b += probabilities[:, 1].tolist()

    # Leave the caller's frame untouched.
    scored = df.copy()
    scored["winner_model_a"] = prob_a
    scored["winner_model_b"] = prob_b
    return scored

In [None]:
st = time.time()

# Longest inputs first: neighbouring rows then have similar lengths, which
# keeps the per-batch dynamic padding small.
data = data.sort_values("length", ascending=False)
# Alternate rows between the two halves so that both GPUs receive roughly
# the same total number of tokens.
sub_1, sub_2 = data.iloc[0::2].copy(), data.iloc[1::2].copy()

# One worker thread per GPU; the results come back in submission order.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = list(
        executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))
    )

result_df = pd.concat(results, axis=0)
proba = result_df[["winner_model_a", "winner_model_b"]].values

print(f"elapsed time: {time.time() - st}")

In [None]:
st = time.time()

if cfg.tta:
    # Sort augmented data by length for efficient dynamic padding
    aug_data = aug_data.sort_values("length", ascending=False)

    # Split into two subsets for parallel processing
    sub_1, sub_2 = aug_data.iloc[0::2], aug_data.iloc[1::2]

    # Perform inference with TTA using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, [sub_1, sub_2], [model_0, model_1], [device_0, device_1])

    tta_result_df = pd.concat(list(results), axis=0)

    # aug_data was sorted by its own token lengths, which can differ from
    # those of the un-swapped inputs, so its row order is not guaranteed to
    # match result_df. Both frames carry the index of `test`, so align on it
    # before combining the arrays positionally.
    tta_result_df = tta_result_df.reindex(result_df.index)

    # The augmented inputs have response_a and response_b swapped, so the
    # model's "a wins" output refers to the original response_b. Read the
    # columns in swapped order to map them back to the original labels.
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a"]].values

    # Combine original and TTA probabilities by averaging
    proba = (proba + tta_proba) / 2

    print(f"elapsed time: {time.time() - st}")

In [None]:


def determine_winner(row):
    """Return the label of the side with the higher win probability.

    ``row`` is any mapping with ``winner_model_a`` and ``winner_model_b``
    entries. ``"model_a"`` is returned only when its value is strictly
    greater; a tie therefore yields ``"model_b"``.
    """
    if row["winner_model_a"] > row["winner_model_b"]:
        return "model_a"
    return "model_b"


# Write the final (possibly TTA-averaged) probabilities back into the frame.
for position, column in enumerate(("winner_model_a", "winner_model_b")):
    result_df.loc[:, column] = proba[:, position]

# Pick the predicted winner row by row.
result_df["winner"] = result_df.apply(determine_winner, axis=1)

# Keep only the columns that go into the submission file, then save and show it.
submission_df = result_df[["id", "winner"]]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)