<a href="https://colab.research.google.com/github/jxm020202/Multilingual-Chatbot-WDSM-CUP/blob/main/Copy_of_gemma_training_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# NOTE: the login snippet below is wrapped in a string literal, so it is NOT
# executed; running this cell only evaluates (and echoes) that string.
# Unwrap it to authenticate before downloading private data sources.
'''# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()
'''

'# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.\nimport kagglehub\nkagglehub.login()\n'

In [None]:
# Disabled: the Drive mount below sits inside a string literal and does not run.
# NOTE(review): Config.output_dir points under /content/drive/MyDrive; confirm
# Drive is mounted some other way before relying on that path.
'''from google.colab import drive
drive.mount('/content/drive')
'''

"from google.colab import drive\ndrive.mount('/content/drive')\n"

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Imported here because the only other `import kagglehub` in this notebook is
# inside a disabled (string-literal) cell, which left the name undefined on a
# fresh kernel.
import kagglehub

# Each call returns the local path of the downloaded data source.
wsdm_cup_multilingual_chatbot_arena_path = kagglehub.competition_download('wsdm-cup-multilingual-chatbot-arena')
emiz6413_73zap2gx_path = kagglehub.dataset_download('emiz6413/73zap2gx')
jxm222_checkpoint_8th_path = kagglehub.dataset_download('jxm222/checkpoint-8th')

print('Data source import complete.')


Data source import complete.


In [None]:
# Install necessary libraries
!pip install -U transformers>=4.42.3 bitsandbytes accelerate peft datasets scikit-learn

In [None]:
import os
import copy
from dataclasses import dataclass
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from sklearn.metrics import log_loss, accuracy_score

In [None]:
@dataclass
class Config:
    """Paths and hyper-parameters read by the training cells of this notebook."""

    # Destination passed to `save_pretrained` at the end of the notebook.
    output_dir: str = "/content/drive/MyDrive/wdsm-checkpoints"

    checkpoint: str = "unsloth/gemma-2-9b-it-bnb-4bit"  # 4-bit quantized gemma-2-9b-instruct
    # Built from the path returned by the Kaggle competition download.
    train_parquet: str = wsdm_cup_multilingual_chatbot_arena_path + "/train.parquet"
    max_length: int = 820  # token budget per tokenized example
    n_splits: int = 5
    fold_idx: int = 0
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 8
    n_epochs: int = 1
    freeze_layers: int = 16  # there're 42 layers in total, we don't add adapters to the first 16 layers
    lr: float = 3e-4
    lora_r: int = 64
    # Evaluated once, at class-definition time, from the default `lora_r`
    # (64 * 2 = 128); passing a different `lora_r` to the constructor does
    # not rescale it.
    lora_alpha: float = lora_r * 2
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    # Annotated so that it is a real dataclass field (constructor argument,
    # shown in repr) like every other setting; without the annotation it was
    # only a plain class attribute.
    lora_dir: str = "/content/checkpoint-9"

config = Config()

In [None]:
# Trainer settings; values that are meant to be tuned come from `config`.
training_args = TrainingArguments(
    # --- where checkpoints are written ---
    output_dir="output",
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=50,

    # --- optimisation ---
    optim=config.optim_type,
    learning_rate=config.lr,
    num_train_epochs=config.n_epochs,
    fp16=True,

    # --- batching ---
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=2,

    # --- logging / evaluation cadence ---
    logging_steps=1,
    eval_strategy="epoch",
)

In [None]:
# LoRA adapter settings for sequence classification.
# NOTE(review): no other cell shown in this notebook references `lora_config`;
# the model cell loads an existing adapter from `config.lora_dir` instead.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # only target self-attention
    target_modules=["q_proj", "k_proj", "v_proj"],
    # Layer indices from `freeze_layers` up to 41; lower layers get no adapter.
    layers_to_transform=list(range(max(0, config.freeze_layers), 42)),
)

In [None]:
# Tokenizer loaded from the same checkpoint name the model cell uses.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.padding_side = "right"  # pad after the real tokens
tokenizer.add_eos_token = True  # We'll add <eos> at the end

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the quantized base checkpoint with a 2-class classification head.
model = Gemma2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=2,  # directly load 2 classes from start
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

# Attach the previously saved LoRA adapter and continue training it.
# `is_trainable=True` is required here: by default PeftModel.from_pretrained
# loads the adapter frozen (inference only), so training would update nothing.
model = PeftModel.from_pretrained(
    model,
    config.lora_dir,
    is_trainable=True,
    device_map="auto",
)
# Sanity check: the printed trainable-parameter count must be non-zero.
model.print_trainable_parameters()

# No manual classifier replacement needed now.


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_and_tokenize(batch):
    """Tokenize a batch of prompt/response pairs and attach integer labels.

    Args:
        batch: Mapping of column name to a list of values, as supplied by
            ``Dataset.map(..., batched=True)``. The ``prompt``,
            ``response_a``, ``response_b`` and ``winner`` columns are read.

    Returns:
        The tokenizer output for the joined texts, truncated to
        ``config.max_length`` and left unpadded, with an added ``labels``
        list: 0 where ``winner == "model_a"``, 1 for any other value.
    """
    # Fall back to the literal "</s>" when the tokenizer defines no separator.
    sep_token = tokenizer.sep_token if tokenizer.sep_token is not None else "</s>"
    combined = [
        f"{p} {sep_token} {ra} {sep_token} {rb}"
        for p, ra, rb in zip(batch["prompt"], batch["response_a"], batch["response_b"])
    ]
    # No padding at this stage: the Trainer cell uses DataCollatorWithPadding,
    # which pads each batch to its own longest sequence. Padding every row to
    # max_length here made every batch full-width regardless of content.
    # NOTE(review): truncation cuts from the end of the joined text, so a long
    # prompt/response_a can leave little or nothing of response_b.
    tokenized = tokenizer(
        combined,
        truncation=True,
        max_length=config.max_length,
    )
    # Convert winner from "model_a" / "model_b" to 0 / 1
    tokenized["labels"] = [0 if w == "model_a" else 1 for w in batch["winner"]]
    return tokenized

# Load full dataset
train_df = pd.read_parquet(config.train_parquet)

# Keep the trailing `fraction` of the rows. With fraction = 1 every row is
# kept; lower it (e.g. 0.1 for the last 10%) for a quicker run.
data_length = len(train_df)
fraction = 1
# Outside (0, 1] the slice below would be empty or silently pick the wrong rows.
if not 0 < fraction <= 1:
    raise ValueError(f"fraction must be in (0, 1], got {fraction}")
start_idx = int(data_length * (1 - fraction))
subset_df = train_df.iloc[start_idx:].reset_index(drop=True)

hf_dataset = Dataset.from_pandas(subset_df)

# Apply tokenization and pairing
tokenized_dataset = hf_dataset.map(preprocess_and_tokenize, batched=True)

# Remove the raw text/label columns now that token ids and labels exist
tokenized_dataset = tokenized_dataset.remove_columns(["prompt", "response_a", "response_b", "winner"])

# Set format for PyTorch
tokenized_dataset.set_format("torch")

# Train/Test split: 10% held out for evaluation, fixed seed for repeatability
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


Map:   0%|          | 0/48439 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and log-loss from the Trainer's evaluation output.

    Args:
        eval_preds: Object whose ``predictions`` attribute holds the raw
            class scores as a NumPy array (one column per class) and whose
            ``label_ids`` attribute holds the integer class ids.

    Returns:
        A dict with keys ``"acc"`` and ``"log_loss"``.
    """
    logits = eval_preds.predictions
    targets = eval_preds.label_ids
    # Cast to float32 before the softmax, then hand NumPy back to scikit-learn.
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Name the classes explicitly: without `labels`, log_loss infers them from
    # y_true and raises when the evaluation split happens to hold one class.
    class_ids = list(range(probs.shape[-1]))
    loss = log_loss(y_true=targets, y_pred=probs, labels=class_ids)
    acc = accuracy_score(y_true=targets, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}

In [None]:
from transformers import DataCollatorWithPadding
import inspect

# Pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever name the installed Trainer exposes.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the Trainer configured in the previous cell; the value
# returned by train() is displayed as this cell's output.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshivzzzzzz[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [None]:
# Save the trained model to config.output_dir.
# NOTE(review): that path is under /content/drive/MyDrive and the drive.mount
# cell near the top is disabled; confirm Drive is mounted, otherwise the files
# presumably land on the runtime's local disk only.
trainer.model.save_pretrained(config.output_dir)