<a href="https://colab.research.google.com/github/krittaprot/kaggle-gemma-peft/blob/main/QLoRA_Fine_Tuning_Gemma_Kaggle_Assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# (QLoRA) Fine-tuning Gemma-7b-Instruct to be a Kaggle Platform AI Assistant

Team: Context Crafters <br>
Members: <br>
1.   Krittaprot Tangkittikun
2.   Kevin Simon Ireri Kori
3.   James Mbugua Mungai


Adapted from: [Aisuko's Notebook](https://www.kaggle.com/code/aisuko/llm-prompt-recovery-with-gemma)

In [None]:
# @title Uncomment to Install Required Libraries
# !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install -q datasets bitsandbytes einops wandb

In [None]:
# @title Pick LLM Model
import os
from google.colab import userdata
import torch

# Name of the base model; later cells read it back with os.getenv("MODEL_NAME")
os.environ["MODEL_NAME"] = "google/gemma-7b-it"

# Copy the Hugging Face access token from Colab's `userdata` store into the environment
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

# Allow the tokenizers library to run in parallel for speed
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'The device available is {device}.')

The device available is cuda.


In [None]:
import json
import numpy as np
import pandas as pd

# @title Load the Training and Testing Data
def load_data(filepath):
  """Load a JSON file into a pandas DataFrame.

  Args:
    filepath: Path to a JSON file whose top-level value can be passed to
      ``pd.DataFrame`` (for example a list of records).

  Returns:
    A ``pd.DataFrame`` built from the parsed JSON content.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(filepath, 'r', encoding='utf-8') as f:
    data = json.load(f)
  return pd.DataFrame(data)

# Paths to the training and testing JSON files (editable through the Colab form fields)
training_data_path = "/content/training_data_kaggle_qa.json" #@param {type:"string"}
testing_data_path = "/content/testing_data_kaggle_qa.json" #@param {type:"string"}

# Parse both files into DataFrames with load_data
train_df = load_data(training_data_path)
test_df = load_data(testing_data_path)

In [None]:
# @title Import Tokenizer
from transformers import AutoTokenizer

# Load the tokenizer for the model named in the MODEL_NAME environment variable,
# requesting the fast implementation and left-side padding.
tokenizer = AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"), use_fast=True, padding_side = "left")

In [None]:
# @title Data Preprocessing
import re
from datasets import load_dataset, Dataset

# clean & format the plain text data
def clean_text(text: str) -> str:
    """Return *text* without markup tags, @mentions and redundant whitespace.

    The steps run in this order: drop every ``<...>`` span, drop every
    ``@word`` token, turn newlines into spaces, collapse each whitespace run
    into a single space, and finally trim both ends.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    without_mentions = re.sub(r'@\w+', '', without_tags)
    single_line = without_mentions.replace('\n', ' ')
    return re.sub(r'\s+', ' ', single_line).strip()

# Apply clean_text in place to the three text columns of both DataFrames.
# NOTE(review): assumes every cell in these columns is a string — confirm the
# JSON files contain no missing values.
for df in (train_df, test_df):
    for column in ['instruction', 'input', 'output']:
        df[column] = df[column].apply(clean_text)

def map_data(df):
    """Convert *df* to a ``Dataset`` and tokenize its text columns.

    The module-level ``tokenizer`` is applied, in batched mode, to the
    'input', 'instruction' and 'output' columns in that order.

    NOTE(review): every pass stores the tokenizer's output columns on the
    dataset, so presumably each later pass replaces what the previous one
    wrote and only the 'output' tokenization survives — confirm that this is
    intended.
    """
    def tokenize_column(dataset, column_name):
        # One batched pass that tokenizes a single text column.
        return dataset.map(lambda batch: tokenizer(batch[column_name]), batched=True)

    dataset = Dataset.from_pandas(df)
    for column_name in ("input", "instruction", "output"):
        dataset = tokenize_column(dataset, column_name)
    return dataset


# Build the tokenized training and testing datasets
train = map_data(train_df)
test = map_data(test_df)

Map:   0%|          | 0/1663 [00:00<?, ? examples/s]

Map:   0%|          | 0/1663 [00:00<?, ? examples/s]

Map:   0%|          | 0/1663 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

In [None]:
# @title Load the Model
# Import necessary classes from the transformers library
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Configure 4-bit loading through bitsandbytes: the quantization data type,
# nested quantization, and the dtype used for computation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                            # Load the model weights in 4-bit precision
    bnb_4bit_quant_type="nf4",                    # Use the NF4 (4-bit NormalFloat) quantization data type
    bnb_4bit_use_double_quant=True,               # Nested quantization: the quantization constants are quantized again
    bnb_4bit_compute_dtype=torch.bfloat16         # Run computation in bfloat16
)

# Load the base model named in MODEL_NAME with the quantization configuration above.
# device_map="auto" lets the library decide where each part of the model is placed.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),                      # Model name read from the environment variable
    quantization_config=bnb_config,               # Apply the quantization configuration
    torch_dtype=torch.bfloat16,                   # Requested dtype for the loaded weights
    device_map="auto"                             # Automatic placement on the available devices
)

# Copy the tokenizer's end-of-sequence token ID into the model configuration
# so that both agree on which token ends a sequence.
model.config.eos_token_id = tokenizer.eos_token_id

# Enable gradient checkpointing to reduce memory usage during training by trading compute for memory.
model.gradient_checkpointing_enable()

def print_trainable_parameters(model):
    """Print how many of *model*'s parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Prints the trainable count, the total count and the trainable
    percentage. A model without parameters reports 0.00% instead of raising
    ``ZeroDivisionError``.
    """
    params = [param for _, param in model.named_parameters()]
    all_params = sum(param.numel() for param in params)
    trainable_params = sum(param.numel() for param in params if param.requires_grad)
    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_params if all_params else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_params} || trainable%: {trainable_pct:.2f}")

# Report the parameter counts of the freshly loaded (quantized) base model
print_trainable_parameters(model)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 786607104 || all params: 4662144000 || trainable%: 16.87


In [None]:
# @title Testing Out-of-the-Box Model Response
# Switch the model to evaluation mode. This changes layer behaviour only
# (e.g. dropout); it does not freeze weights or disable gradient tracking.
model.eval()

# Question (input) and instruction for the chatbot, editable through the Colab form
Question = "How to submit a file to a kaggle competition via API?"  #@param {type:"string"}
Instruction = "Please answer the following question." #@param {type:"string"}

# Build the prompt using Gemma's turn markers.
# NOTE(review): the prompt ends with a bare <start_of_turn> and no role name;
# Gemma's chat template presumably expects "<start_of_turn>model" here — confirm.
prompt=f'''<start_of_turn>user Below is the instruction. \n\n###
Instruction:\n{Instruction}\n###
Input:\n{Question} <end_of_turn>
<start_of_turn>'''

# Tokenize the prompt. Despite its name, `input_ids` holds the whole tokenizer
# output, which is unpacked into generate() below.
# NOTE(review): "cuda" is hard-coded although `device` is computed in the setup cell.
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate up to 400 new tokens and print the decoded sequence (special tokens included)
outputs = model.generate(**input_ids, max_new_tokens=400)
print(tokenizer.decode(outputs[0]))

In [None]:
# @title Look up Manipulable Layers for LoRA
from transformers import Conv1D

def get_specific_layer_names(model):
    """List a short name for every Linear/Embedding/Conv2d/Conv1D submodule.

    The short name is the fifth dot-separated component of the module's
    qualified name (index 4). Modules whose qualified name has fewer than
    five components contribute an empty string. Duplicates are kept; the
    caller de-duplicates.
    """
    candidate_types = (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)

    def fifth_component(qualified_name):
        # Shallow names have no component at index 4 and map to ''.
        parts = qualified_name.split('.')
        return parts[4] if len(parts) > 4 else ''

    return [
        fifth_component(name)
        for name, module in model.named_modules()
        if isinstance(module, candidate_types)
    ]


# De-duplicate the collected names before displaying them
unique_layer_names = list(set(get_specific_layer_names(model)))
print(f'The attachable layers for LoRA include {unique_layer_names}')

The attachable layers for LoRA include ['', 'q_proj', 'v_proj', 'o_proj', 'down_proj', 'up_proj', 'gate_proj', 'k_proj']


In [None]:
# @title Prepare the quantized model for LoRA Fine-Tuning
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, set_seed, EarlyStoppingCallback
from trl import SFTTrainer

# Set a fixed seed for reproducibility
set_seed(2024)

# LoRA-specific hyperparameters (passed to LoraConfig below)
r = 128 #@param {type:"integer"}
lora_alpha = 256 #@param {type:"integer"}
lora_dropout = 0.15 #@param {type:"number"}

# Other hyperparameters.
# batch_size is the per-device batch size and ga_steps the number of
# gradient-accumulation steps; their product is the effective batch size.
batch_size = 8 #@param {type:"integer"}
ga_steps = 16 #@param {type:"integer"}
lr = 0.0002 #@param {type:"number"}
num_epochs = 3 #@param {type:"integer"}
max_grad_norm = 0.3 # @param {type:"number"}
warmup_ratio = 0.03 # @param {type:"number"}
weight_decay = 0.01 # @param {type:"number"}
max_seq_length = 512 #@param {type:"integer"}
save_total_limit = 5  #@param {type:"integer"}

# Optimizer and learning-rate scheduler (passed to TrainingArguments below)
lr_scheduler_type = "cosine" #@param {type:"string"}
optim = "paged_adamw_32bit" #@param {type:"string"}

# Function to preprocess the data according to a specific format
def preprocess_func(example):
    """Format dataset rows into prompt strings for supervised fine-tuning.

    Args:
        example: A mapping with ``'input'``, ``'instruction'`` and
            ``'output'`` keys. Each value is either a list of strings (a
            batch, one entry per row) or a single string (one row).

    Returns:
        A list with one formatted string per row, in input order.
    """
    contexts = example['input']
    questions = example['instruction']
    answers = example['output']

    # A single un-batched row arrives as plain strings; wrap it so the rows
    # are not mistaken for sequences of characters.
    if isinstance(contexts, str):
        contexts, questions, answers = [contexts], [questions], [answers]

    # Emit one prompt per row. Formatting only the first row would silently
    # drop every other example in the batch.
    return [
        f"Context:\n {context}\n\n"
        f"Question:\n {question}\n\n"
        f"Answer:\n {answer}"
        for context, question, answer in zip(contexts, questions, answers)
    ]

# Prepare the quantized model for k-bit training, with gradient checkpointing enabled.
# NOTE(review): the original "trainable parameters should become 0" remark
# presumably describes the model right after this call — confirm.
model=prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=True
)

# LoRA adapter configuration, built from the hyperparameters defined above
lora_config = LoraConfig(
    r=r,
    lora_alpha=lora_alpha,
    target_modules=['q_proj', 'up_proj','down_proj', 'gate_proj', 'o_proj', 'k_proj', 'v_proj'], #excluded to save mem: [" "]
    lora_dropout=lora_dropout,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Wrap the prepared model with the LoRA adapter and turn off the generation cache for training
model=get_peft_model(model, lora_config)
model.config.use_cache=False

# Number of examples in the tokenized training dataset
num_train_samples = len(train)

# Examples consumed per optimizer step: per-device batch size x gradient-accumulation steps
# (assumes a single device — TODO confirm for multi-GPU runs)
effective_batch_size = batch_size * ga_steps

# Approximate number of optimizer steps per epoch (floor division)
steps_per_epoch = num_train_samples // effective_batch_size

# Evaluate roughly twice per epoch. Clamp to at least 1 so that a dataset with
# fewer than two effective batches does not produce a zero step interval.
eval_steps = max(1, steps_per_epoch // 2)
# Save at every second evaluation, which keeps save_steps a multiple of eval_steps
save_steps = 2 * eval_steps

# Training configuration assembled from the hyperparameters defined above.
# Evaluation and checkpointing are both step-based, using eval_steps / save_steps.
args = TrainingArguments(
    output_dir="model",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    logging_steps=1,
    eval_steps=eval_steps,
    save_steps=save_steps,
    gradient_accumulation_steps=ga_steps,
    num_train_epochs=num_epochs,
    lr_scheduler_type=lr_scheduler_type,
    optim=optim,  # Default paged_adamw_32bit was chosen to avoid NaN values in loss
    learning_rate=lr,
    group_by_length=True,
    fp16=False,
    bf16=False,
    ddp_find_unused_parameters=False,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={'use_reentrant': False},
    report_to=[],
    load_best_model_at_end= True,  # Required by EarlyStoppingCallback
    save_total_limit= save_total_limit # Limit the number of checkpoints kept on disk
)

# Supervised fine-tuning trainer: trains on `train`, evaluates on `test`,
# formats rows with preprocess_func and stops early after 3 evaluations
# without improvement.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train,
    eval_dataset=test,
    formatting_func=preprocess_func,
    max_seq_length=max_seq_length,
    packing=False,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Move the LoRA-wrapped model to the GPU.
# NOTE(review): the base model was loaded with device_map="auto", so this call
# is presumably redundant — confirm before removing.
model.cuda()  # Make sure the model is on GPU

# Print GPU memory currently allocated and reserved on cuda:0, in GB
print(f"Total memory allocated on GPU: {torch.cuda.memory_allocated('cuda:0') / 1e9:.2f} GB")
print(f"Total memory cached on GPU: {torch.cuda.memory_reserved('cuda:0') / 1e9:.2f} GB")


Total memory allocated on GPU: 8.78 GB
Total memory cached on GPU: 8.79 GB


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
# @title Train the Model
# Turn off the model's generation cache (use_cache) while training
trainer.model.config.use_cache = False

# Start training the model
trainer.train()

# Re-enable the generation cache for inference
trainer.model.config.use_cache = True

Step,Training Loss,Validation Loss
6,11.8397,5.448492
12,3.8106,4.71679
18,1.9397,3.109334
24,1.8329,2.859618
30,1.0768,2.822861
36,1.0072,2.771529


In [None]:
# @title Push the model to huggingface hub
# Option 1: interactive login widget
from huggingface_hub import notebook_login
notebook_login()

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

# Target repository on the Hugging Face Hub, as "<account>/<repository>"
hf_name = 'Krittaprot'  # @param {type:"string"}
# NOTE(review): `id` shadows the Python builtin of the same name; consider
# renaming it once no other cell depends on this variable.
id = 'gemma-7b-ft-kaggle-qa' # @param {type:"string"}
model_id = hf_name + "/" + id

# Upload the model (here the LoRA-wrapped `model`) to the repository
model.push_to_hub(model_id)
# trainer.push_to_hub(model_id)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

adapter_model.safetensors:   0%|          | 0.00/1.60G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Krittaprot/gemma-7b-ft-kaggle-qa/commit/f8d01fe47a90f7b5524d2046a3d9dc3c4c89b077', commit_message='Upload model', commit_description='', oid='f8d01fe47a90f7b5524d2046a3d9dc3c4c89b077', pr_url=None, pr_revision=None, pr_num=None)

### Load the Model with Trained LoRA Adapter

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftConfig

# Name of the base model the adapter was trained on
os.environ["MODEL_NAME"] = "google/gemma-7b-it"

# NOTE: here model_id is the *base* model name, not the adapter repository
model_id = os.environ["MODEL_NAME"]

# Load the tokenizer of the base model
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Configure 4-bit loading through bitsandbytes: the quantization data type,
# nested quantization, and the dtype used for computation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                            # Load the model weights in 4-bit precision
    bnb_4bit_quant_type="nf4",                    # Use the NF4 (4-bit NormalFloat) quantization data type
    bnb_4bit_use_double_quant=True,               # Nested quantization: the quantization constants are quantized again
    bnb_4bit_compute_dtype=torch.bfloat16         # Run computation in bfloat16
)

# Load the base model with the quantization configuration above.
# device_map="auto" lets the library decide where each part of the model is placed.
model = AutoModelForCausalLM.from_pretrained(
    os.getenv("MODEL_NAME"),                      # Same value as model_id, read back from the environment
    quantization_config=bnb_config,               # Apply the quantization configuration
    torch_dtype=torch.bfloat16,                   # Requested dtype for the loaded weights
    device_map="auto"                             # Automatic placement on the available devices
)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Hub repository that holds the trained LoRA adapter
adapter_model_id = "Krittaprot/gemma-7b-ft-kaggle-qa"

# Fetch the adapter and attach it to the already-loaded base model
model.load_adapter(adapter_model_id)
print('The adapter is successfully loaded!')

The adapter is successfully loaded!


In [None]:
# @title Perform Inference with the Fine-Tuned Model

def get_ft_model_inference_with_context(ft_model, context, question, max_new_tokens=400):
    """Generate an answer to *question* from the fine-tuned model.

    Args:
        ft_model: Model exposing ``eval()`` and ``generate()``. Its ``device``
            attribute is used when present; otherwise "cuda" is assumed.
        context: Background text placed in the prompt's ``Context:`` field.
        question: Question placed in the prompt's ``Question:`` field.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the newly generated tokens only, with special
        tokens removed and surrounding whitespace stripped.

    Uses the module-level ``tokenizer``.
    """
    ft_model.eval()
    # The prompt ends with "Answer:" so the model continues with the answer text.
    prompt_text = f"Answer the following question based on the context provided:\nContext: {context}\nQuestion: {question}\nAnswer:"
    # Place the inputs on the model's own device instead of a hard-coded one.
    target_device = getattr(ft_model, "device", "cuda")
    with torch.no_grad():
        model_inputs = tokenizer(prompt_text, return_tensors="pt").to(target_device)
        outputs = ft_model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length rather than searching the decoded text for "Answer:", which
    # picks the wrong position whenever the context or question contains
    # that marker.
    prompt_length = model_inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Example usage: `context` acts as the system-style instruction, `question`
# is editable through the Colab form field.
context = "You are an expert who mainly help users better understand the Kaggle Platform and Kaggle Competition. You should answer the question in a clear and concise manner. Make sure to include a traceable reference when possible."
question = "Please share with me the link to the most popular kaggle competition!"  #@param {type:"string"}
# Query the adapter-equipped model and print its answer
response = get_ft_model_inference_with_context(model, context, question)
print(response)

The most popular Kaggle competition is "Titanic: Machine Learning from Disaster." The link to the competition is: https://www.kaggle.com/competitions/titanic. This information is from the official Kaggle website. Refer to the official website for the latest information.
