In [1]:
!pip install -U bitsandbytes "transformers>=4.45.1" accelerate peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting transformers>=4.45.1
  Downloading transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading transformers-4.46.2-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading accelerate-1.1.1-py3-none-any.whl (333 kB)
[2K   [90m━━━━━━━━━━━━━━

In [2]:
import os
import copy

import functools
from dataclasses import dataclass

import numpy as np
import pandas as pd

import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from huggingface_hub import HfFolder

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, log_loss

In [3]:
from kaggle_secrets import UserSecretsClient
# Fetch the Hugging Face token from the secret named "hf_token" instead of hard-coding it.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("hf_token")

In [4]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token read from the secrets client.
login(token=hf_token, add_to_git_credential=True)

Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
@dataclass
class Config:
    """Hyperparameters, checkpoint names and dataset sizes for the fine-tuning run.

    Note: the default of ``lora_alpha`` is evaluated once, when the class body runs,
    from the default ``lora_r``. Passing a different ``lora_r`` to the constructor does
    not rescale it; pass ``lora_alpha`` explicitly in that case.
    """
    output_dir: str = "LitvinKA/gemma-2-9b-content-filter"
    checkpoint: str = "unsloth/gemma-2-9b-it-bnb-4bit"  # 4-bit quantized gemma-2-9b-instruct
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 4
    gradient_accumulation_steps: int = 8
    per_device_eval_batch_size: int = 8
    n_epochs: int = 1
    freeze_layers: int = 16  # there are 42 layers in total; no adapters are added to the first 16
    lr: float = 2e-4
    warmup_ratio: float = 0.05  # a fraction, so float (was mis-annotated as int)
    lora_r: int = 16
    lora_alpha: int = lora_r * 2  # integer product of the default lora_r
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    train_size: int = 150000  # max number of training rows kept after the split
    val_size: int = 10000  # max number of validation rows kept after the split
config = Config()

In [6]:
# Training configuration; every tunable value is taken from `config` so that the
# output directory, logging directory and hub repository id cannot drift apart.
training_args = TrainingArguments(
    output_dir=config.output_dir,
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    optim=config.optim_type,
    fp16=True,
    learning_rate=config.lr,
    warmup_ratio=config.warmup_ratio,
    # logging & evaluation strategies: log, evaluate and checkpoint on the same step interval
    logging_dir=f"{config.output_dir}/logs",
    logging_strategy="steps",
    logging_steps=800,
    eval_strategy="steps",
    eval_steps=800,
    save_strategy="steps",
    save_steps=800,
    # push to hub parameters
    report_to="none",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=config.output_dir,
    hub_token=HfFolder.get_token(),
)

In [7]:
# LoRA adapter configuration for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # Adapters go on the q/k/v attention projections and on all three MLP projections
    # (o_proj is not targeted).
    target_modules=[
        "q_proj", "k_proj", "v_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Skip the first `freeze_layers` of the 42 decoder layers; adapt the rest.
    layers_to_transform=list(range(max(config.freeze_layers, 0), 42)),
)

In [8]:
# Load the checkpoint with a single-logit classification head (num_labels=1).
model = Gemma2ForSequenceClassification.from_pretrained(
    config.checkpoint, num_labels=1, torch_dtype=torch.float16, device_map="auto"
)
# Turn off the generation cache on the model config before training.
model.config.use_cache = False
# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
model

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-15): 16 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
              (rotary_emb): Gemma2RotaryEmbedding()
            )
            (mlp): Gemma2MLP(
              (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
              (down_proj): Linear4bit(in_features=14336, out_features=3584, bia

In [9]:
# Sanity check: print the trainable vs. total parameter counts of the PEFT-wrapped model.
model.print_trainable_parameters()

trainable params: 30,248,448 || all params: 9,271,958,016 || trainable%: 0.3262


In [10]:
# Read the training CSV and give its three columns fixed names in one chained expression.
train_data = pd.read_csv('/kaggle/input/d/tatianamerzl/wb-winter-24/train.csv').set_axis(
    ['ID', 'text', 'labels'], axis=1
)

In [11]:
# Positive-class weight for the BCE loss: ratio of negative to positive examples.
num_positive = int(train_data['labels'].sum())
num_negative = len(train_data) - num_positive

# Without this guard a numpy division by zero silently yields inf (only a RuntimeWarning),
# which would then turn the training loss into inf/NaN.
if num_positive == 0:
    raise ValueError("train_data contains no positive labels; cannot compute pos_weight")

pos_weight = num_negative / num_positive

label_weights = torch.tensor([pos_weight], dtype=torch.float32, device=model.device)

In [12]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for validation.
val_split_ratio = 0.10

# Stratified, shuffled, seeded split.
# NOTE: this rebinds `train_data`, so re-running the cell without reloading the CSV
# splits the already-split training part again.
train_data, validation_data = train_test_split(
    train_data,
    test_size=val_split_ratio,
    stratify=train_data['labels'].values,
    shuffle=True,
    random_state=42,
)

In [13]:
# Keep at most `train_size` / `val_size` leading rows (positional slice) of each split.
train_dataset = Dataset.from_pandas(train_data.iloc[:config.train_size])
validation_dataset = Dataset.from_pandas(validation_data.iloc[:config.val_size])

In [14]:
# Load the tokenizer that ships with the model checkpoint.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
# Pad on the right, matching the right-padding done by pad_sequence in collate_fn.
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [15]:
# Batched tokenization helper for Dataset.map
def tokenize(batch):
    """Tokenize the 'text' column of a batch and carry its 'labels' column along.

    No truncation or padding arguments are passed to the tokenizer here.
    """
    encoded = tokenizer(batch['text'])
    encoded['labels'] = batch['labels']
    return encoded

# Tokenize both splits; batched=True hands `tokenize` a batch of rows per call.
tokenized_train_data = train_dataset.map(tokenize, batched=True)
tokenized_validation_data = validation_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
def compute_metrics(eval_preds):
    """Compute binary-classification metrics from single-logit predictions.

    Args:
        eval_preds: object exposing ``predictions`` (raw logits, one per example)
            and ``label_ids`` (0/1 targets).

    Returns:
        dict with "accuracy", "log_loss", "precision", "recall" and "f1".
        ``log_loss`` is the unweighted log loss of the sigmoid probabilities.
    """
    # Flatten both arrays to 1-D so (N, 1) logits line up with (N,) labels explicitly,
    # and upcast so sigmoid / log-loss are not evaluated in half precision.
    logits = np.asarray(eval_preds.predictions, dtype=np.float32).reshape(-1)
    labels = np.asarray(eval_preds.label_ids).reshape(-1).astype(int)

    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    pred_classes = (probs >= 0.5).astype(int)

    return {
        "accuracy": accuracy_score(labels, pred_classes),
        # labels=[0, 1] keeps log_loss defined even if the eval slice holds a single class.
        "log_loss": log_loss(labels, probs, labels=[0, 1]),
        # zero_division=0 returns the same 0.0 as the default but without the warning.
        "precision": precision_score(labels, pred_classes, zero_division=0),
        "recall": recall_score(labels, pred_classes, zero_division=0),
        "f1": f1_score(labels, pred_classes, zero_division=0),
    }

In [17]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a positively-weighted binary cross-entropy on one logit."""

    def __init__(self, label_weights, **kwargs):
        """Store the positive-class weight and forward all other arguments to ``Trainer``.

        Args:
            label_weights: tensor used as ``pos_weight`` of the BCE loss, or ``None``
                for an unweighted loss.
            **kwargs: keyword arguments passed through to ``Trainer.__init__``.
        """
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model and return the weighted BCE-with-logits loss.

        Args:
            model: the model to call with ``inputs``.
            inputs: batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: accepted for signature compatibility; not used.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Labels get a trailing dimension so they match one-logit-per-example output.
        targets = labels.unsqueeze(1).to(device=logits.device, dtype=torch.float32)

        # The weight tensor was created once on `model.device`; move it to wherever the
        # logits actually are so a sharded (device_map="auto") model cannot hit a device mismatch.
        pos_weight = self.label_weights
        if pos_weight is not None:
            pos_weight = pos_weight.to(device=logits.device, dtype=torch.float32)

        # Evaluate the loss in float32 regardless of the dtype the logits come out in.
        loss = F.binary_cross_entropy_with_logits(
            logits.float(), targets, pos_weight=pos_weight
        )

        return (loss, outputs) if return_outputs else loss


In [18]:
def collate_fn(batch, tokenizer):
    """Collate tokenized examples into right-padded batch tensors.

    Args:
        batch: list of dicts, each with 'input_ids', 'attention_mask' and 'labels'.
        tokenizer: object whose ``pad_token_id`` is used to pad 'input_ids'.

    Returns:
        dict with 'input_ids' and 'attention_mask' padded to the longest example in
        the batch (batch-first) and 'labels' stacked into one tensor.
    """
    columns = {}
    for key in ('input_ids', 'attention_mask', 'labels'):
        columns[key] = [torch.tensor(example[key]) for example in batch]

    padded_ids = pad_sequence(
        columns['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    # Padding positions get mask value 0.
    padded_mask = pad_sequence(columns['attention_mask'], batch_first=True, padding_value=0)

    return {
        'input_ids': padded_ids,
        'attention_mask': padded_mask,
        'labels': torch.stack(columns['labels']),
    }

In [19]:
# Assemble the trainer; the tokenizer is bound into the collator with functools.partial
# so the collator can be called with the batch alone.
trainer = CustomTrainer(
    label_weights=label_weights,
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_validation_data,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning; per training_args, evaluation and checkpointing happen every 800 steps.
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Accuracy,Log Loss,Precision,Recall,F1,Runtime,Samples Per Second,Steps Per Second
800,0.513,0.170869,0.9837,0.057399,0.908063,0.967095,0.93665,1796.5673,5.566,0.696
1600,0.271,0.230721,0.9844,0.095538,0.913505,0.966292,0.939158,1796.0772,5.568,0.696


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
