In [1]:
!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

Collecting transformers>=4.42.3
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers>=4.42.3)
  Downloading tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-

In [2]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import pandas as pd
import pandas as pd
import torch
import functools
from torch.nn import functional as F
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
)
from peft import LoraConfig, PeftModel

In [3]:
@dataclass
class Config:
    gemma_dir = 'unsloth/gemma-2-9b-it-bnb-4bit'
    lora_dir = '/kaggle/input/1epoch-8freeze-qdora-finetuned-dls/output/checkpoint-1036'
    max_length = 2048
    batch_size = 4
    device = torch.device('cuda')    

cfg = Config()

In [4]:
test = pd.read_csv('/kaggle/input/dls-data/test.csv.csv').drop(columns=['Unnamed: 0'])

In [5]:
test['tags'].fillna('{PASS}', inplace=True)
test['tags'] = test['tags'].apply(lambda row: row[1:-1])
test['assessment'] = test['assessment'].astype(str)
test['total'] = test['assessment'].str.cat(test['tags'], sep=' <sep> ')
test['total'] = test['total'].str.cat(test['text'], sep=' <sep> ')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['tags'].fillna('{PASS}', inplace=True)


In [6]:
test.drop(
    columns=[
        'index',
        'assessment',
        'text',
        'tags'
    ],
    axis=1,
    inplace=True
)

In [7]:
tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.add_eos_token = True
tokenizer.add_sep_token = True
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [8]:
model = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    num_labels=50,
    torch_dtype=torch.bfloat16,
    device_map=cfg.device,
    use_cache=False,
)

model = PeftModel.from_pretrained(model, cfg.lora_dir)

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at unsloth/gemma-2-9b-it-bnb-4bit and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
ds = DatasetDict({
    'test': Dataset.from_dict({'text': [str(x) for x in test['total'].tolist()]})
})

In [10]:
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'], max_length=cfg.max_length, truncation=True)
    return tokenized_inputs

ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
ds = ds.with_format('torch')

Map:   0%|          | 0/9015 [00:00<?, ? examples/s]

In [11]:
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]

    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }

In [12]:
test_dataloader = DataLoader(ds['test'], batch_size=cfg.batch_size, collate_fn=collate_fn)

In [13]:
from tqdm import tqdm

In [14]:
model.eval()
all_logits = []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        inputs = {k: v.to(model.device) for k, v in batch.items()}
        logits = model(**inputs).logits
        all_logits.append(logits.cpu())

all_logits = torch.cat(all_logits, dim=0)
probs = torch.sigmoid(all_logits).float().numpy()

100%|██████████| 2254/2254 [1:42:45<00:00,  2.74s/it]


In [15]:
with open(f'1epoch_gemma_2048_bf16.npy', 'wb') as f:
    np.save(f, probs)