<a href="https://colab.research.google.com/github/hemhemoh/Wazobia-Wellness/blob/main/Fine_tune_Gemma_2_2b_it_on_Wazobia_Mental_Health_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# %pip install -U transformers -q
# %pip install -U datasets -q
# %pip install -U accelerate -q
# %pip install -U peft -q
# %pip install -U trl -q
# %pip install -U bitsandbytes -q

# %pip install -U wandb -q

In [2]:
import os
import torch
import wandb
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    logging
)
from peft import LoraConfig, get_peft_model
from huggingface_hub import login
from trl import SFTTrainer, setup_chat_format
import bitsandbytes as bnb

# Kaggle secrets setup

login(token="")

# Wandb initialization for tracking
wandb.login(key="")
run = wandb.init(project='Fine-tune Gemma-2-2b-it on Wazobia-Mental Health Dataset', job_type="training", anonymous="allow")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhemhemoh[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
base_model = "google/gemma-2-2b-it"
dataset_name = '/content/drive/MyDrive/Kaggle/combined_dataset.csv'
new_model = "Gemma-2-2b-it-wazobia-bot"

In [4]:

# Adjust precision and attention based on GPU
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
    !pip install -qqq flash-attn  # Install flash attention if supported
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

# BitsAndBytes configuration for memory-efficient model loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model with quantization and optimized attention
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

# Efficient LoRA fine-tuning configuration
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    lora_module_names.discard('lm_head')  # Exclude lm_head for 16-bit
    return list(lora_module_names)

modules = find_all_linear_names(model)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

# model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
import pandas as pd
from datasets import Dataset

# Load the dataset from your drive
file_path = dataset_name
df = pd.read_csv(file_path)

# Convert pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.shuffle(seed=42).select(range(7000))
# Predefined instruction to use for all rows
predefined_instruction = "You are a highly skilled and empathetic mental health therapist fluent in English, Yoruba, Igbo, and Hausa. Respond to each user's concerns in the language they use to ensure comfort and understanding."

def format_chat_template(row):
    # Combine the instruction with the user's context
    user_content = (
        f"{predefined_instruction}\n\nUser's complaint:\n{row['contexts']}"
    )
    row_json = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": row["responses"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(format_chat_template, num_proc=4)

dataset = dataset.train_test_split(test_size=0.1)
# Dynamic padding for efficiency
data_collator = lambda batch: tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)

Map (num_proc=4):   0%|          | 0/7000 [00:00<?, ? examples/s]

In [6]:

# Training arguments
training_args = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=500,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
    load_best_model_at_end=False  # Disable loading best model at the end
)


# Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",  # Specify  composite field called "text"
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
)

# Disable caching during training for gradient computation efficiency
model.config.use_cache = False


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/6300 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]



In [7]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()


In [None]:
trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss,Validation Loss
200,1.6672,2.374455
400,1.1621,2.181096
600,1.4967,2.117497
800,1.0932,2.01486
1000,1.4809,1.948841
1200,1.3137,1.912263
1400,1.5064,1.876322
1600,1.0753,1.840447
1800,1.3023,1.807259
2000,1.1126,1.778338


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [None]:
# run = wandb.init(project='Fine-tune Gemma-2-2b-it on Wazobia-Mental Health Dataset', job_type="training", anonymous="allow")
# trainer.train(resume_from_checkpoint=True)

In [9]:
wandb.finish()
model.config.use_cache = True

0,1
eval/loss,█▅▄▃▃▃▂▂▁▁
eval/runtime,▁▄▆▃▄▆▅█▆▇
eval/samples_per_second,█▅▃▆▆▃▄▁▃▂
eval/steps_per_second,█▅▃▆▆▃▄▁▃▂
train/epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train/grad_norm,▅▃▂▃█▂▃▂▂▁▁▂▁▁▁▂▁▁▁▁▂▁▂▁▂▁▂▁▁▂▂▁▂▁▁▁▂▁▁▁
train/learning_rate,██████▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▂▂▁▁▁▁
train/loss,█▆▆▇▆█▄▆▃▆▃▃▇▅▂▄▂▃▆▄▆▃▇▂▅▅▃▄▄▃▅▅▄▂▄▄▁▄▅▇

0,1
eval/loss,1.71806
eval/runtime,161.3839
eval/samples_per_second,4.337
eval/steps_per_second,4.337
train/epoch,0.6727
train/global_step,2119.0
train/grad_norm,2.16543
train/learning_rate,0.00017
train/loss,2.0673


In [10]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hemhemoh/Gemma-2-2b-it-wazobia-bot/commit/82a5d3befdecc8cda8b17050bced2145c8e2ca6b', commit_message='Upload model', commit_description='', oid='82a5d3befdecc8cda8b17050bced2145c8e2ca6b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hemhemoh/Gemma-2-2b-it-wazobia-bot', endpoint='https://huggingface.co', repo_type='model', repo_id='hemhemoh/Gemma-2-2b-it-wazobia-bot'), pr_revision=None, pr_num=None)

In [11]:
messages = [{"role": "user", "content": "Olulufemi ja mi le"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Optimized generation with tuned sampling strategies
outputs = model.generate(
    **inputs,
    max_length=350,  # Increase max length for complex answers
    num_return_sequences=1,
    top_k=50,
    top_p=0.85,  # Narrow top-p for more deterministic output
    temperature=0.3,  # Slightly higher temperature for balance between creativity and accuracy
    no_repeat_ngram_size=3,
)

# Decode and clean up the output
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = text.split("model")[1].strip()

print(response)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


O ṣeun, o ṣe akiyesi pe o ti wa ni iṣẹ ti o ti o ba ti o ni iru awọn iṣoro ti o wa ni imọran ti o yoo ṣẹlẹ ati pe o le ṣee ṣiṣẹ pẹlu awọn ọna ti o le jẹ ki o si ṣakoso awọn idi ti o dara julọ ti o jẹ iṣe ti o ju lati ṣawari awọn ohun ti o ń ṣaṣẹ lori awọn ero ti o ko ti o di. O le Ṣe iṣeduro lati Ṣawari Awọn Iṣoro Ti o ti O ti o Ti o Ti wa ni Iṣẹ Ti o ni Isimiri ti o si ti o tun wa ni ẹdun ti o nira ati pe ọpọlọpọ awọn Ilana ti o rii ti o rọrun ati pe awọn ti o ro pe o wa ti o ru awọn awọn eniyan ti o sọrọ si awọn imọlara ti o fẹ lati ẹru awọ. O yoo Ṣẹda awọn ẹgbẹ ti awọn aṣẹ ati awọn eyije ti o fi ẹso ti o. O ti wa ninu awọn oṣi aye ti o nilo lati ọtọ si iṣoogun ti oju-ọna ti awọkọ ti awẹṣẹ. O tun wa ninun ti ara rẹ.


In [None]:
# 'Fine-tune Gemma-2-2b-it on Wazobia-Mental Health Dataset'