<a href="https://colab.research.google.com/github/kpurang/expts/blob/main/fineTuneQuantizedGemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning Gemma 7B

This notebook is derived from [this notebook](https://colab.research.google.com/drive/1VoYNfYDKcKRQRor98Zbf2-9VQTtGJ24k?usp=sharing).

## Setup
Install the required packages and mount Google Drive.

In [None]:
# Install the quantization / fine-tuning stack. transformers, peft and accelerate
# are installed from the default branch of their GitHub repositories; the other
# packages come from the package index.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime may
# resolve different library versions on each run — consider pinning once a working
# combination is known.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets
!pip install -q -U trl

# Load the TensorBoard notebook extension; the %tensorboard magic in the final
# cell depends on it.
%load_ext tensorboard

import os
import pandas as pd
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from trl import SFTTrainer,  DataCollatorForCompletionOnlyLM
from datasets import Dataset, load_dataset

from google.colab import drive
# Mount Google Drive at /content/drive; BASEDIR in the constants cell points
# inside this mount.
drive.mount('/content/drive')
# Local directory passed to the trainer as output_dir and read by %tensorboard.
os.makedirs('./logs', exist_ok=True)

## Constants
All tunable constants are defined here, in one place.

In [None]:

# --- Output locations (under the mounted Drive folder) ---
BASEDIR = '/content/drive/My Drive/Colab Notebooks/'
NEWMODELNAME = 'gemma_7b_ft'  # used for both the output directory and the loss file name
OUTDIR = os.path.join(BASEDIR, NEWMODELNAME)  # start_train() saves the trained model here
LOSSFILE = os.path.join(BASEDIR, f"losses/{NEWMODELNAME}.csv")  # trainer log history as CSV

# --- Input data (paths are relative to BASEDIR) ---
TRAINDATA_FILE = 'dataset/train.csv'
VALDATA_FILE = 'dataset/val.csv'
TEXT_FIELD = 'prompt'  # dataset column passed to SFTTrainer as dataset_text_field

# --- Base model ---
MODEL_ID = 'google/gemma-7b-it'

# --- BitsAndBytesConfig arguments (see get_tokenizer_model) ---
LOAD_IN_4BIT = True
BNB_4BIT_USE_DOUBLE_QUANT = True
BNB_4BIT_QUANT_TYPE = 'nf4'
BNB_4BIT_COMPUTE_DTYPE = torch.bfloat16

# --- LoraConfig arguments (see add_lora) ---
LORA_R = 8
LORA_ALPHA = 16
LORA_TARGET_MODULES = ['q_proj', 'k_proj', 'v_proj', 'o_proj']
LORA_DROPOUT = 0.05
LORA_BIAS = 'none'
LORA_TASK_TYPE = 'CAUSAL_LM'

# --- TrainingArguments values (see get_trainer) ---
TRAIN_PER_DEVICE_TRAIN_BATCH_SIZE = 1
TRAIN_GRADIENT_ACCUMULATION_STEPS = 4
TRAIN_NUM_TRAIN_EPOCHS = 5
TRAIN_WARMUP_STEPS = 2
TRAIN_LEARNING_RATE = 2e-4
TRAIN_FP16 = True
TRAIN_LOGGING_STEPS = 1
# NOTE(review): TRAIN_OUTPUT_DIR is not referenced by the visible code;
# get_trainer() passes output_dir='./logs' directly.
TRAIN_OUTPUT_DIR = 'outputs'
TRAIN_OPTIM = 'paged_adamw_8bit'


In [None]:
def get_tokenizer_model():
  """Load the tokenizer and the quantized base model for MODEL_ID.

  Two steps:
    1. Build the quantization configuration from the LOAD_IN_4BIT and
       BNB_4BIT_* constants.
    2. Fetch the tokenizer and the model, applying that configuration to
       the model.

  Returns:
    A (tokenizer, model) tuple.
  """
  quant_cfg = BitsAndBytesConfig(
      load_in_4bit=LOAD_IN_4BIT,
      bnb_4bit_use_double_quant=BNB_4BIT_USE_DOUBLE_QUANT,
      bnb_4bit_quant_type=BNB_4BIT_QUANT_TYPE,
      bnb_4bit_compute_dtype=BNB_4BIT_COMPUTE_DTYPE,
  )
  tok = AutoTokenizer.from_pretrained(MODEL_ID)
  # The empty-string key in device_map addresses the root module, so the
  # whole model is assigned to device index 0.
  base_model = AutoModelForCausalLM.from_pretrained(
      MODEL_ID,
      quantization_config=quant_cfg,
      device_map={"": 0},
  )
  return tok, base_model



In [None]:
def add_lora(model):
  """Attach LoRA adapters to `model` using the LORA_* constants.

  Gradient checkpointing is switched on and the model is passed through
  prepare_model_for_kbit_training before the adapters are added.

  Args:
    model: the base model, e.g. the one returned by get_tokenizer_model().

  Returns:
    The model returned by get_peft_model.
  """
  model.gradient_checkpointing_enable()
  kbit_ready = prepare_model_for_kbit_training(model)
  lora_cfg = LoraConfig(
      r=LORA_R,
      lora_alpha=LORA_ALPHA,
      target_modules=LORA_TARGET_MODULES,
      lora_dropout=LORA_DROPOUT,
      bias=LORA_BIAS,
      task_type=LORA_TASK_TYPE,
  )
  return get_peft_model(kbit_ready, lora_cfg)


In [None]:
# load dataset

def genTrainPrompt(row):
  """Return the training prompt text for one dataset row.

  This is a template to be filled in: the f-string below is currently blank,
  so `row` is not used and the result is just two newline characters.

  Args:
    row: one row of the training DataFrame (passed by DataFrame.apply).
  """
  trainPrompt = f"""

"""
  return trainPrompt

def genValPrompt(row):
  """Return the validation prompt text for one dataset row.

  This is a template to be filled in: the f-string below is currently blank,
  so `row` is not used and the result is a single newline character.

  Args:
    row: one row of the validation DataFrame (passed by DataFrame.apply).
  """
  valPrompt = f"""
"""
  return valPrompt

def get_datasets(trainFile=os.path.join(BASEDIR, TRAINDATA_FILE),
                valFile=os.path.join(BASEDIR, VALDATA_FILE)):
  """Read the train/validation CSVs and turn each row into a prompt string.

  Each CSV is read with its first column as the index and its first line as
  the header. Every row is converted with genTrainPrompt / genValPrompt, and
  the resulting strings are stored in a single column named TEXT_FIELD, which
  is the same column the trainer is told to read.

  Args:
    trainFile: path to the training CSV. The default is computed once, when
      this function is defined, from BASEDIR and TRAINDATA_FILE.
    valFile: path to the validation CSV. The default is computed once, when
      this function is defined, from BASEDIR and VALDATA_FILE.

  Returns:
    A (trainDataset, valDataset) tuple of datasets.Dataset objects.
  """
  trainDf = pd.read_csv(trainFile, index_col=0, header=0)
  print(f"Dataset len: {len(trainDf)}")
  trainPrompts = trainDf.apply(genTrainPrompt, axis=1)
  # Name the column after TEXT_FIELD instead of a hard-coded literal so the
  # dataset cannot drift from the trainer's dataset_text_field setting.
  trainPromptDf = pd.DataFrame(trainPrompts, columns=[TEXT_FIELD])

  valDf = pd.read_csv(valFile, index_col=0, header=0)
  valPrompts = valDf.apply(genValPrompt, axis=1)
  valPromptDf = pd.DataFrame(valPrompts, columns=[TEXT_FIELD])

  trainDataset = Dataset.from_pandas(trainPromptDf)
  valDataset = Dataset.from_pandas(valPromptDf)
  return trainDataset, valDataset



In [None]:


def get_trainer(tokenizer, model, trainDataset, valDataset):
  """Build the SFTTrainer used for fine-tuning.

  Args:
    tokenizer: tokenizer of the base model. Its pad token is set to the EOS
      token only when it does not already define one.
    model: the model to train (LoRA adapters already attached).
    trainDataset: training dataset containing a TEXT_FIELD column.
    valDataset: evaluation dataset containing a TEXT_FIELD column.

  Returns:
    The configured SFTTrainer. As a side effect, model.config.use_cache is
    set to False.
  """
  # Fall back to EOS-as-padding only when the tokenizer has no pad token.
  # Overwriting an existing pad token makes pad_token_id equal eos_token_id;
  # the language-modeling collator masks pad_token_id positions out of the
  # labels, so EOS would be excluded from the loss and never learned.
  if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
  # Marker the completion-only collator searches for in each example.
  response_template = "### PROMPT:"
  collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

  # NOTE(review): output_dir is the literal './logs' (the directory the
  # %tensorboard magic reads), not the TRAIN_OUTPUT_DIR constant.
  training_args = transformers.TrainingArguments(
      per_device_train_batch_size=TRAIN_PER_DEVICE_TRAIN_BATCH_SIZE,
      gradient_accumulation_steps=TRAIN_GRADIENT_ACCUMULATION_STEPS,
      num_train_epochs=TRAIN_NUM_TRAIN_EPOCHS,
      warmup_steps=TRAIN_WARMUP_STEPS,
      #max_steps=10,
      learning_rate=TRAIN_LEARNING_RATE,
      fp16=TRAIN_FP16,
      logging_steps=TRAIN_LOGGING_STEPS,
      output_dir='./logs',
      optim=TRAIN_OPTIM,
  )
  trainer = SFTTrainer(
      model=model,
      train_dataset=trainDataset,
      eval_dataset=valDataset,
      dataset_text_field=TEXT_FIELD,
      args=training_args,
      data_collator=collator,
  )
  model.config.use_cache = False # turn on for inference
  return trainer

In [None]:
def start_train():
  """Run the full fine-tuning pipeline and persist its results.

  Steps: create the output locations, load the datasets, load the quantized
  base model, attach LoRA adapters, build the trainer, train, save the
  trained model to OUTDIR and write the trainer's log history to LOSSFILE.

  Raises:
    OSError: if OUTDIR or the directory holding LOSSFILE cannot be created
      (for any reason other than OUTDIR already existing).
  """
  try:
    os.makedirs(OUTDIR, exist_ok=False)
  except FileExistsError as e:
    # An existing directory is fine (it will be written into); other
    # failures, such as an unmounted Drive, now surface before training.
    print('Output directory may exist.\n', e)
  # Create the loss-file directory up front: without it, to_csv would fail
  # only after the whole training run had finished.
  os.makedirs(os.path.dirname(LOSSFILE), exist_ok=True)

  trainDataset, valDataset = get_datasets()
  print('Done Datasets')
  tokenizer, model = get_tokenizer_model()
  print('Done base model')
  model = add_lora(model)
  print('DOne add Lora')
  trainer = get_trainer(tokenizer, model, trainDataset, valDataset)
  print('Done trainer')
  trainer.train()
  print('Done train')
  trainer.model.save_pretrained(OUTDIR)
  print('Done save model')
  # log_history is the trainer's list of logged records; one CSV row each.
  lossDF = pd.DataFrame(trainer.state.log_history)
  lossDF.to_csv(LOSSFILE, index=True, header=True)
  print('Saved losses')



In [None]:
with torch.no_grad():
  torch.cuda.empty_cache()
%tensorboard --logdir ./logs
start_train()
