In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
! pip install accelerate -q
! pip install -i https://pypi.org/simple/ bitsandbytes -q
! pip install peft -q
! pip install trl -q
! pip install --upgrade huggingface_hub -q
! pip install git+https://github.com/huggingface/datasets -U -q
! pip install git+https://github.com/huggingface/transformers -U -q

In [3]:
import os
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
access_token_read = user_secrets.get_secret("HF_TOKEN")
login(token = access_token_read)

In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    GemmaTokenizer,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [5]:
base_model = "google/gemma-2-2b"
dataset_name = "harishnair04/mtsamples"
new_model = "Gemma-medtr-2b-sft-v2"

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)


In [7]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    token=access_token_read
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, token=access_token_read, trust_remote_code=True)

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Fine-tuning the full model will take a lot of time, so to accelerate the training process, we will create and attach the adapter layer, resulting in a faster and more memory-efficient process. 

In [8]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [9]:
# LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Loading the dataset

In [10]:
# Importing the dataset
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=65)

def format_chat_template(row):
    # Ensure none of the values are None
    description = row["description"] if row["description"] is not None else ""
    transcription = row["transcription"] if row["transcription"] is not None else ""
    keywords = row["keywords"] if row["keywords"] is not None else ""
    
    row_json = [
        {"role": "system", "content": description},
        {"role": "user", "content": transcription},
        {"role": "assistant", "content": keywords}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

mtsamples.csv:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4999 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/4999 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'description', 'medical_specialty', 'sample_name', 'transcription', 'keywords', 'text'],
    num_rows: 4999
})

In [11]:
dataset['text'][30]

"<|im_start|>system\n Followup left-sided rotator cuff tear and cervical spinal stenosis.  Physical examination and radiographic findings are compatible with left shoulder pain and left upper extremity pain, which is due to a combination of left-sided rotator cuff tear and moderate cervical spinal stenosis.<|im_end|>\n<|im_start|>user\nREASON FOR VISIT: , Followup left-sided rotator cuff tear and cervical spinal stenosis.,HISTORY OF PRESENT ILLNESS: , Ms. ABC returns today for followup regarding her left shoulder pain and left upper extremity C6 radiculopathy.  I had last seen her on 06/21/07.,At that time, she had been referred to me Dr. X and Dr. Y for evaluation of her left-sided C6 radiculopathy.  She also had a significant rotator cuff tear and is currently being evaluated for left-sided rotator cuff repair surgery, I believe on, approximately 07/20/07.  At our last visit, I only had a report of her prior cervical spine MRI.  I did not have any recent images.  I referred her for c

For model evaluation, we will split out the dataset into training and test split. 

In [12]:
dataset = dataset.train_test_split(test_size=0.1)

training the model

In [13]:
# Setting Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="epoch",
    learning_rate=2e-4,
    torch_empty_cache_steps = 500,
    fp16=False,
    bf16=False,
    group_by_length=True,
    save_strategy="epoch",
    report_to="wandb",
)
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4499 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111356252222322, max=1.0)…

Step,Training Loss,Validation Loss
1800,No log,1.695078
3600,No log,1.52455
5400,1.656600,1.373628
7200,1.656600,1.259705


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=8998, training_loss=1.337776233260169, metrics={'train_runtime': 21163.5373, 'train_samples_per_second': 0.425, 'train_steps_per_second': 0.425, 'total_flos': 5.081403999852749e+16, 'train_loss': 1.337776233260169, 'epoch': 2.0})

evaluating the model performance

In [14]:
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.028 MB of 0.028 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▅▃▁
eval/runtime,██▁▆
eval/samples_per_second,▁▁█▁
eval/steps_per_second,▁▁█▁
train/epoch,▁▃▄▅▆██
train/global_step,▁▃▄▅▆██
train/grad_norm,█▁
train/learning_rate,█▁
train/loss,█▁

0,1
eval/loss,1.25971
eval/runtime,476.0954
eval/samples_per_second,1.05
eval/steps_per_second,1.05
total_flos,5.081403999852749e+16
train/epoch,2.0
train/global_step,8998.0
train/grad_norm,1.44129
train/learning_rate,0.0
train/loss,1.019


In [15]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



adapter_model.safetensors:   0%|          | 0.00/2.40G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/harishnair04/Gemma-medtr-2b-sft-v2/commit/a7b8f158c3518ccf1d91f60e69b9afdf13919c36', commit_message='Upload model', commit_description='', oid='a7b8f158c3518ccf1d91f60e69b9afdf13919c36', pr_url=None, repo_url=RepoUrl('https://huggingface.co/harishnair04/Gemma-medtr-2b-sft-v2', endpoint='https://huggingface.co', repo_type='model', repo_id='harishnair04/Gemma-medtr-2b-sft-v2'), pr_revision=None, pr_num=None)

In [16]:
# base_model_reload= AutoModelForCausalLM.from_pretrained(
#     base_model,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.bfloat16,
#     device_map="cpu",
# )

In [17]:
# tokenizer = AutoTokenizer.from_pretrained(base_model)

In [18]:
# base_model_reload = setup_chat_format(base_model_reload)
# model = PeftModel.from_pretrained(base_model_reload, "/kaggle/working/Gemma-medtr-2b-sft-v1")

In [19]:
# tokenizer = setup_chat_format(tokenizer)

In [20]:
# model = model.merge_and_unload()