In [None]:
! pip install -q -U transformers
! pip install -q -U peft
! pip install -q -U bitsandbytes
! pip install -q -U datasets
! pip install -q -U trl
! pip install -q -U accelerate
! pip install -q -U wandb
! pip install -q -U psutil
! pip install  -q  GPUtil


In [None]:
import os
import torch
from datasets import load_dataset,Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import transformers
from transformers.integrations import WandbCallback
import random
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
print('import bnb')
import bitsandbytes as bnb

# from GPUtil import showUtilization as gpu_usage

import wandb
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns

import pandas as pd
import time
import math
import gc

In [None]:
# Record the versions of the core libraries so a run can be reproduced later.
print('torch version: ', torch.__version__)
print('transformers version: {}'.format(transformers.__version__))
print('bnb version: {}'.format(bnb.__version__))

In [None]:
class CFG:
  """Single source of truth for the fine-tuning run's hyperparameters and paths.

  Every value is a class attribute evaluated once, top to bottom, when the
  class body executes. Derived values (``run_name``, ``max_steps``,
  ``log_table_steps``) therefore depend on the attributes defined above them,
  so the order of the statements below matters.
  """

  # The model that you want to train from the Hugging Face hub
  # (alternative tried previously: 'meta-llama/Llama-2-7b-hf')
  model_name = 'NousResearch/Llama-2-7b-chat-hf'
  seed = 42
  # Fraction of the selected rows held out for validation
  val_split = 0.1
  # Debug mode shortens max_steps and the save/logging intervals below
  debug = True
  # When True, max_steps is extended by the step of the latest checkpoint
  resume_train = False

  # The instruction dataset to use
  dataset_name = "Sathya-20/Filtered_Medical_Dataset_final"

  # Weights & Biases project name
  project= 'LLM finetuning'

  ################################################################################
  # QLoRA parameters
  ################################################################################

  # LoRA attention dimension
  lora_r = 8

  # Alpha parameter for LoRA scaling
  lora_alpha = 16

  # Dropout probability for LoRA layers
  lora_dropout = 0.1

  ################################################################################
  # bitsandbytes parameters
  ################################################################################

  # Activate 4-bit precision base model loading
  use_4bit = True

  # Compute dtype for 4-bit base models (name of a torch dtype attribute)
  bnb_4bit_compute_dtype = "float16"

  # Quantization type (fp4 or nf4)
  bnb_4bit_quant_type = "nf4"

  # Activate nested quantization for 4-bit base models (double quantization)
  use_nested_quant = False

  ################################################################################
  # TrainingArguments parameters
  ################################################################################

  # Output directory where the model predictions and checkpoints will be stored
  output_dir = './Checkpoints/'
  # CSV file that accumulates the sample generations logged during evaluation
  report_path = './report_table.csv'

  # Number of training epochs
  num_train_epochs = 1

  # Enable fp16/bf16 training (set bf16 to True with an A100)
  fp16 = True
  bf16 = False

  # Batch size per GPU for training
  per_device_train_batch_size = 4

  # Batch size per GPU for evaluation
  per_device_eval_batch_size = 4

  # Number of update steps to accumulate the gradients for
  gradient_accumulation_steps = 2

  # Enable gradient checkpointing
  gradient_checkpointing = True

  # Maximum gradient norm (gradient clipping)
  max_grad_norm = 0.3

  # Initial learning rate (AdamW optimizer)
  learning_rate = 2e-4

  # Weight decay to apply to all layers except bias/LayerNorm weights
  weight_decay = 0.001

  # Optimizer to use
  optim = "paged_adamw_32bit"

  # Learning rate schedule
  lr_scheduler_type = "constant"

  # Ratio of steps for a linear warmup (from 0 to learning rate)
  warmup_ratio = 0.03

  # Group sequences into batches with same length
  # Saves memory and speeds up training considerably
  group_by_length = True

  # Number of training steps (overrides num_train_epochs)
  max_steps = 10 if debug else 100

  # The run name is derived from the *requested* step budget, i.e. before any
  # resume offset is added below.
  run_name = model_name.split('/')[-1] + '-' + dataset_name.split('/')[-1] + f'_{max_steps}_steps'

  # When resuming, extend the budget by the step number of the most recent
  # checkpoint directory ("checkpoint-<step>"). Taking the max (instead of the
  # first directory listing entry) is independent of listing order, and a
  # missing directory / no checkpoint simply adds 0 instead of raising.
  if resume_train:
    max_steps += max(
        (int(x.split('-')[-1])
         for x in (os.listdir(output_dir) if os.path.isdir(output_dir) else [])
         if 'checkpoint' in x),
        default=0,
    )

  # Save checkpoint every X updates steps
  save_steps = 1 if debug else 5

  # Log every X updates steps
  logging_steps = 1 if debug else 5

  # Evaluation counters at which sample generations are logged: three roughly
  # evenly spaced points. The stride is clamped to 1 so that max_steps < 3
  # does not produce a zero range step (ValueError).
  log_table_steps = list(range(1, max_steps, max(1, max_steps // 3)))

  ################################################################################
  # SFT parameters
  ################################################################################

  # Maximum sequence length to use
  max_seq_length = 1024

  # Pack multiple short examples in the same input sequence to increase efficiency
  packing = True

  # Device placement passed to from_pretrained; 'auto' delegates the placement
  # to the library instead of pinning everything to GPU 0
  device_map = 'auto'

In [None]:
class clr:
    """ANSI escape sequences used to colour console output in this notebook."""
    S = '\033[1m\033[94m'  # start: bold + bright blue
    E = '\033[0m'          # end: reset all attributes
    R = '\033[31m'         # red
    G = '\033[1;32m'       # bold green
    Y = '\033[33m'         # yellow

# Blue-to-red palette used for the notebook's figures.
my_colors = [
    "#5EAFD9", "#449DD1", "#3977BB", "#2D51A5", "#5C4C8F",
    "#8B4679", "#C53D4C", "#E23836", "#FF4633", "#FF5746",
]
CMAP1 = ListedColormap(my_colors)

# Render the palette as a strip of swatches.
print(f"{clr.S}Notebook Color Schemes:{clr.E}")
sns.palplot(sns.color_palette(my_colors))
plt.show()

In [None]:
import os
import wandb

# SECURITY: never commit an API key to the notebook. The key is expected in the
# WANDB_API_KEY environment variable (set outside the notebook); when it is
# missing it is requested interactively so it never lands in source or outputs.
# NOTE(review): a key was previously hardcoded in this cell - revoke it in the
# W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    from getpass import getpass
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

# Log in using the key from the environment
wandb.login()


In [None]:
# Start the W&B run for this notebook: project and run name come from CFG, and
# the `resume` flag mirrors CFG.resume_train.
wandb.init(
    project = CFG.project,
    name = CFG.run_name,
    resume= CFG.resume_train,
)

In [None]:
# Load dataset (you can process it here)
# Load the 'train' split of the instruction dataset named in CFG.
# Later cells read its 'Query' and 'Response' columns.
dataset = load_dataset(CFG.dataset_name, split = 'train')

In [None]:
def format_text(text, line_lenght=15):
    """Re-flow ``text`` for display, breaking the line every ``line_lenght`` words.

    The text is split on single spaces; every word is emitted followed by one
    space, and a newline is inserted after each ``line_lenght``-th word.

    Returns:
        The re-flowed string (it always ends with a space or a newline).
    """
    pieces = []
    for position, token in enumerate(text.split(' '), start=1):
        pieces.append(token + ' ')
        if position % line_lenght == 0:
            pieces.append('\n')
    return ''.join(pieces)

In [None]:
# Show the first raw query/response pair, wrapped for readability.
for heading, column in (('Prompt', 'Query'), ('\nCompletition', 'Response')):
    print(clr.S + heading + clr.E)
    print(format_text(dataset[column][0]))

In [None]:
# Accumulates the fully formatted training strings under the 'text' column.
formated_text = {'text':[]}

# Llama-2 style instruction template: the query goes between [INST] ... [/INST]
# and the reference answer follows, terminated by the </s> marker.
base_string = '''[INST]
Below is an instruction that describes a task.
Write a response that appropriately completes the request.\n\n
{user_prompt}\n\n
[/INST] {completition} </s>'''

# Same preamble as in base_string; only used to build the coloured preview below.
system_prompt = '''Below is an instruction that describes a task.
Write a response that appropriately completes the request.\n\n '''

# Seed Python's RNG and shuffle with an explicit seed. `random.seed()` returns
# None, so passing its result as `seed=` would leave the shuffle unseeded and
# non-reproducible.
random.seed(CFG.seed)
dataset = dataset.shuffle(seed=CFG.seed)

for i,(Query, Response) in enumerate(zip(dataset['Query'],dataset['Response'])):
    text = base_string.format(user_prompt = Query,
                              completition = Response
                              )
    if i == 0:
        # Colour-highlighted rendering of the first example, for display only.
        text_example = clr.S+'[INST]\n'+clr.E +\
            system_prompt + Query + '\n\n' +\
            clr.S+'[/INST] '+clr.E + Response + clr.S+' </s>'+clr.E

    formated_text['text'].append(text)


# Single-column ('text') dataset consumed by the trainer.
formate_dataset = Dataset.from_dict(formated_text)


In [None]:
# Number of rows to use: debug runs are capped at 10_000 rows, but never more
# than the dataset actually holds (otherwise the train/validation boundary
# below would be computed from a size that does not exist).
n_data = min(10000, len(formate_dataset)) if CFG.debug else len(formate_dataset)
# Index of the first validation row (everything before it is training data).
n_data_train = int((1 - CFG.val_split) * n_data)

In [None]:
# Contiguous split of the (already shuffled) data: the first n_data_train rows
# train, the remaining rows up to n_data validate.
train_dataset = Dataset.from_dict(formate_dataset[:n_data_train])
valid_dataset = Dataset.from_dict(formate_dataset[n_data_train:n_data])

print(f"{clr.S}Debug Mode = {clr.E} {CFG.debug}")
print(f"{clr.S}\nData Size{clr.E}")
print(f"Train =  {len(train_dataset)} | Valid =  {len(valid_dataset)}")

In [None]:
### save HF Dataset to disk
train_dataset.save_to_disk('./MedText-train')
valid_dataset.save_to_disk('./MedText-valid')
### load HF Dataset from disk
train_dataset = load_from_disk('./MedText-train')
valid_dataset = load_from_disk('./MedText-valid')
train_dataset, valid_dataset

In [None]:
# Resolve the dtype name stored in CFG (e.g. "float16") to the torch dtype object.
compute_dtype = getattr(torch, CFG.bnb_4bit_compute_dtype)

In [None]:
# Hint only: when running 4-bit with fp16 compute on a GPU whose compute
# capability major version is >= 8, suggest switching to bf16.
if CFG.use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        banner = "=" * 80
        print("\n".join([
            banner,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            banner,
        ]))

In [None]:
# Print a short hardware report for every visible CUDA device.
num_gpus = torch.cuda.device_count()
# Get information about each GPU
print('General Report')
for i in range(num_gpus):
    gpu_properties = torch.cuda.get_device_properties(i)
    print(f"\nGPU {i}: {gpu_properties.name}")
    print(f"  Total Memory: {gpu_properties.total_memory / (1024**3):.2f} GB")
    # major/minor of the device properties are the compute capability of the
    # GPU, not the CUDA toolkit version, so label them accordingly.
    print(f"  Compute Capability: {gpu_properties.major}.{gpu_properties.minor}")


In [None]:
# Quantization settings for loading the base model: every field is taken from
# CFG, with the compute dtype already resolved to a torch dtype above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=CFG.use_4bit,
    bnb_4bit_quant_type=CFG.bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=CFG.use_nested_quant,
)
print('BitsAndBytesConfig\n')
# Last expression of the cell: displays the resolved configuration as a dict.
bnb_config.to_dict()


In [None]:
#https://www.kaggle.com/code/hinepo/llm-instruction-finetuning-wandb#Model
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Counts elements (``numel``) over all parameters and, separately, over those
    with ``requires_grad`` set, then prints both totals (digits grouped with
    underscores) together with the trainable share in percent, rounded to two
    decimals.
    """
    sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = round(100 * trainable_params / all_param, 2)
    print(
        f"trainable params: {trainable_params:_} || all params: {all_param:_} || trainable: {share} %"
    )

In [None]:
%%time
# Load the base causal LM with the quantization config built above; device
# placement follows CFG.device_map.
model = AutoModelForCausalLM.from_pretrained(
    CFG.model_name,
    quantization_config=bnb_config,
    device_map=CFG.device_map
)

# Disable the generation KV cache for training
# (presumably because gradient checkpointing is enabled - TODO confirm).
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like the Llama tensor-parallel setting;
# 1 presumably selects the plain (non-sliced) linear path - confirm.
model.config.pretraining_tp = 1
print_trainable_parameters(model)

In [None]:
# Tokenizer matching the base model.
tokenizer = AutoTokenizer.from_pretrained(
                            CFG.model_name,
                            trust_remote_code=True
                            )
# Reuse the EOS token for padding (no separate pad token is configured here).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [None]:
# Display how the model's modules were placed across devices.
model.hf_device_map

In [None]:
# Peek at a handful of parameter names to see how the model's layers are named.
layer_names = [key for key in model.state_dict()]

preview = slice(4, 10)
for name in layer_names[preview]:
    print(name)

In [None]:
def find_all_linear_names(model):
    """Collect the leaf names of all 4-bit linear modules in ``model``.

    Every module that is an instance of ``bnb.nn.Linear4bit`` contributes the
    last component of its dotted module path (e.g. ``a.b.q_proj`` gives
    ``q_proj``). ``lm_head`` is excluded from the result.

    Returns:
        A list of the unique names (order is not significant).
    """
    linear_cls = bnb.nn.Linear4bit
    lora_module_names = {
        path.rsplit('.', 1)[-1]
        for path, module in model.named_modules()
        if isinstance(module, linear_cls)
    }
    # The output head is never a LoRA target here.
    lora_module_names.discard('lm_head')
    return list(lora_module_names)
# Names of the quantized linear modules; used below as the LoRA target modules.
linear_layers = find_all_linear_names(model)
print(f"Linear layers in the model: {linear_layers}")

In [None]:
# LoRA adapter configuration: rank, alpha and dropout come from CFG, and the
# adapters are attached to every linear module name collected above.
peft_config = LoraConfig(
    lora_alpha=CFG.lora_alpha,
    lora_dropout=CFG.lora_dropout,
    target_modules = linear_layers,
    r=CFG.lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)
# Last expression of the cell: displays the resolved configuration as a dict.
peft_config.to_dict()

In [None]:
# Attach the LoRA adapter to the model and report the trainable-parameter count.
# NOTE(review): the same peft_config is also passed to SFTTrainer further down;
# confirm the adapter is not meant to be applied in only one of the two places.
model.add_adapter(peft_config)
print_trainable_parameters(model)

In [None]:
# Trainer configuration; every tunable is read from CFG so the config class
# stays the single place to change a run.
training_arguments = TrainingArguments(
    output_dir=CFG.output_dir,
    num_train_epochs=CFG.num_train_epochs,
    per_device_train_batch_size=CFG.per_device_train_batch_size,
    per_device_eval_batch_size=CFG.per_device_eval_batch_size,
    gradient_accumulation_steps=CFG.gradient_accumulation_steps,
    optim=CFG.optim,
    learning_rate=CFG.learning_rate,
    weight_decay=CFG.weight_decay,
    fp16=CFG.fp16,
    bf16=CFG.bf16,
    max_grad_norm=CFG.max_grad_norm,
    # max_steps takes precedence over num_train_epochs
    max_steps=CFG.max_steps,
    warmup_ratio=CFG.warmup_ratio,
    lr_scheduler_type=CFG.lr_scheduler_type,
    save_strategy = 'steps',
    save_steps=CFG.save_steps,
    # keep only the most recent checkpoint on disk
    save_total_limit = 1,
    # read from CFG instead of a hardcoded True so the config flag is honoured
    gradient_checkpointing = CFG.gradient_checkpointing,
    logging_strategy = "steps",
    logging_steps=CFG.logging_steps,
    # evaluate at the same cadence as logging
    evaluation_strategy = "steps",
    eval_steps = CFG.logging_steps,
    save_safetensors = True,
    report_to="wandb",
    seed = CFG.seed,
    data_seed = CFG.seed,
    push_to_hub = False,
)

In [None]:
class LLMSampleCB(WandbCallback):
    """W&B callback that logs sample generations as a table during training.

    After each evaluation the callback advances an internal counter
    (``self.step``). When that counter is listed in ``CFG.log_table_steps`` it
    generates a completion for every sample prompt, appends the rows to the CSV
    at ``CFG.report_path`` and logs the accumulated table to W&B under
    ``sample_predictions``.
    """

    def __init__(self, trainer, test_dataset, num_samples=10, max_new_tokens=256):
        """Set up the sample prompts, model/tokenizer handles and generation config.

        Args:
            trainer: trainer providing ``model`` and ``tokenizer``.
            test_dataset: dataset with a ``text`` column in the training prompt
                format; its first ``num_samples`` rows are used as prompts.
            num_samples: number of prompts to generate for (capped at the
                dataset size).
            max_new_tokens: generation length limit.
        """
        super().__init__()

        # Ensure the `log_model` is not passed as a string
        self._log_model = None  # Use None or a proper model object if needed

        # Never ask for more rows than the dataset holds.
        sample_count = min(num_samples, len(test_dataset))
        self.sample_dataset = test_dataset.select(range(sample_count))
        self.model, self.tokenizer = trainer.model, trainer.tokenizer

        # Define the generation config
        self.gen_config = transformers.GenerationConfig.from_pretrained(
            trainer.model.name_or_path, max_new_tokens=max_new_tokens
        )

        # Evaluation counter; continues after the last step stored in the
        # report CSV when one already exists (e.g. on a resumed run).
        if os.path.exists(CFG.report_path):
            self.step = pd.read_csv(CFG.report_path)['step'].max() + 1
        else:
            self.step = 1

    def generate(self, instruction):
        """Generate a response for the given instruction.

        Returns:
            The decoded continuation only (prompt tokens are stripped).
        """
        # Follow the model's own device instead of assuming 'cuda'.
        tokenized_prompt = self.tokenizer(
            instruction, return_tensors='pt', padding=True
        ).to(self.model.device)
        # Prompt length taken from the tensor itself, so it does not depend on
        # the tokenizer exposing per-sequence encoding objects.
        prompt_length = tokenized_prompt['input_ids'].shape[1]

        with torch.inference_mode():
            output = self.model.generate(
                **tokenized_prompt,
                generation_config=self.gen_config,
                pad_token_id=self.tokenizer.eos_token_id
            )

        return self.tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    def samples_table(self, examples):
        """Build the DataFrame of generations for the current evaluation.

        Returns:
            A DataFrame with instruction, reference response, generation and
            the generation settings, or ``None`` when the current counter is
            not in ``CFG.log_table_steps``. The counter is advanced either way.
        """
        report_step_df = None
        if self.step in CFG.log_table_steps:
            # Generation settings are identical for every row; compute once.
            gen_settings = self.gen_config.to_dict()
            rows = []
            for example in examples:
                prompt = example["text"]
                # Split on the first '[/INST]'; a prompt without the marker
                # yields an empty reference response instead of an IndexError.
                head, _, tail = prompt.partition('[/INST]')
                instruction = head[6:]   # drop the leading '[INST]'
                response = tail[:-5]     # drop the trailing ' </s>'
                # NOTE(review): the prompt is sent to the model without the
                # [INST] ... [/INST] wrapper used in training - confirm intent.
                generation = self.generate(instruction=instruction)
                rows.append([instruction, response, generation, *gen_settings.values()])

            report_step_df = pd.DataFrame(
                rows,
                columns=['instruction', 'response', 'generation'] + list(gen_settings.keys()),
            )

        self.step += 1
        return report_step_df

    def on_evaluate(self, args, state, control, **kwargs):
        """Log the wandb.Table after calling trainer.evaluate."""
        super().on_evaluate(args, state, control, **kwargs)

        report_step_df = self.samples_table(self.sample_dataset)
        if report_step_df is None:
            return

        # samples_table() has already advanced the counter past this evaluation.
        logged_step = self.step - 1
        report_step_df['step'] = logged_step

        if logged_step == 1 or not os.path.exists(CFG.report_path):
            # First report (or no previous file): start a fresh table.
            new_report_df = report_step_df.copy()
        else:
            previous_report_df = pd.read_csv(CFG.report_path)
            new_report_df = pd.concat([previous_report_df, report_step_df], ignore_index=True)
        new_report_df.to_csv(CFG.report_path, index=False)

        table = wandb.Table(dataframe=new_report_df, allow_mixed_types=True)
        wandb.log({"sample_predictions": table})

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset = valid_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=CFG.max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=CFG.packing,
)

In [None]:
# Sample-generation callback: uses the first 5 validation rows as prompts and
# limits each generation to 256 new tokens.
wandb_callback = LLMSampleCB(trainer,
                             valid_dataset,
                             num_samples=5,
                             max_new_tokens= 256)

In [None]:
# Register the sample-generation callback with the trainer.
trainer.add_callback(wandb_callback)

In [None]:
# Display the training split (cell output).
train_dataset

In [None]:
# One optimizer update consumes per_device_train_batch_size *
# gradient_accumulation_steps examples, so divide by that product to get a
# number comparable with max_steps (which counts optimizer updates).
# NOTE(review): this is an estimate from the raw row count on a single device;
# packing may change how many sequences the trainer actually iterates over.
effective_batch_size = CFG.per_device_train_batch_size * CFG.gradient_accumulation_steps
print(clr.S + 'All Train Steps = ' + clr.E, math.ceil(len(train_dataset) / effective_batch_size))
print(clr.S + 'Max Train Steps = ' + clr.E, CFG.max_steps)

In [None]:
# Make sure the checkpoint directory exists; exist_ok avoids the separate
# check-then-create step (and the race between the two).
os.makedirs(CFG.output_dir, exist_ok=True)