In [None]:
%%capture
# Install Unsloth (nightly branch) and Unsloth Zoo directly from GitHub; %%capture hides pip's output.
# --no-deps tells pip not to install the packages' dependencies, and --no-cache-dir bypasses pip's cache.
!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

In [None]:
import os
from huggingface_hub import HfFolder, login

# Detect the runtime. Kaggle is recognised by its /kaggle mount; otherwise Colab
# is recognised by whether the google.colab package can be imported.
IN_KAGGLE = os.path.exists('/kaggle')
IN_COLAB = False
if not IN_KAGGLE:
    try:
        from google.colab import drive, userdata
        IN_COLAB = True
    except ImportError:
        # Not Colab either: fall through to the local branch below.
        pass

# Each branch defines the same three names: HF_TOKEN, WANDB_API_KEY, output_dir.
if IN_COLAB:
    print("Running in Google Colab")
    drive.mount('/content/drive')

    HF_TOKEN = userdata.get('hf_token')
    WANDB_API_KEY = userdata.get('wandb_token')
    output_dir = "/content/drive/MyDrive/scalable_lab2"
elif IN_KAGGLE:
    print("Running on Kaggle")
    from kaggle_secrets import UserSecretsClient
    secrets = UserSecretsClient()

    HF_TOKEN = secrets.get_secret('hf_token')
    WANDB_API_KEY = secrets.get_secret('wandb_token')
    output_dir = "/kaggle/working/scalable_lab2"
else:
    print("Running locally")
    from dotenv import load_dotenv
    load_dotenv()

    HF_TOKEN = os.getenv("HF_TOKEN")
    WANDB_API_KEY = os.getenv("WANDB_API_KEY")
    output_dir = "scalable_lab2"

# exist_ok avoids the separate existence check and the race it implies.
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory is {output_dir}.")


# Log in only to the services for which a credential was found.
if WANDB_API_KEY:
    # Imported here because this cell runs before the training cell's `import wandb`;
    # without it a fresh kernel fails with NameError on the next line.
    import wandb
    wandb.login(key=WANDB_API_KEY)
if HF_TOKEN:
    login(token=HF_TOKEN)
    HfFolder.save_token(HF_TOKEN)

In [None]:
from unsloth import FastLanguageModel
import torch
from unsloth.chat_templates import get_chat_template


# Loader settings read by get_model(); max_seq_length is also handed to the trainer.
load_in_4bit = True    # 4-bit quantization cuts memory use; may be set to False.
dtype = None           # None = auto-detect. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
max_seq_length = 2048  # Any value works; RoPE scaling is supported internally.

def get_model(model_name: str = "unsloth/Llama-3.2-1B-Instruct", lora_dropout: float = 0):
    """Load a base model with LoRA adapters attached and a chat-templated tokenizer.

    Reads the notebook-level ``max_seq_length``, ``dtype`` and ``load_in_4bit``
    settings for the base-model load.

    Args:
        model_name: Checkpoint name passed to ``FastLanguageModel.from_pretrained``.
            Defaults to the 1B Llama 3.2 instruct model used throughout the notebook.
        lora_dropout: Dropout forwarded to ``FastLanguageModel.get_peft_model``.
            Defaults to 0 (the previously hard-coded value), which the original
            notebook comment marks as the optimized setting.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped model and the tokenizer
        returned by ``get_chat_template`` with the "llama-3.1" template.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = lora_dropout, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template = "llama-3.1",
    )

    return model, tokenizer


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.1.7: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
def get_dataset(tokenizer = None):
    """Load FineTome-100k and render every conversation to text with the chat template.

    Args:
        tokenizer: Tokenizer whose ``apply_chat_template`` renders the conversations.
            When omitted, a notebook-level ``tokenizer`` variable is used, which is
            what the original zero-argument version relied on.

    Returns:
        The "train" split of ``mlabonne/FineTome-100k`` after ``standardize_sharegpt``,
        with an added "text" column holding the rendered conversations.

    Raises:
        NameError: If no tokenizer is passed and no global ``tokenizer`` exists.
    """
    from datasets import load_dataset
    from unsloth.chat_templates import standardize_sharegpt

    if tokenizer is None:
        # Backward-compatible fallback for callers that still use get_dataset().
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        # Fail before the download/map instead of deep inside dataset.map().
        raise NameError(
            "get_dataset() needs a tokenizer: pass one explicitly or define a global `tokenizer`."
        )

    def formatting_prompts_func(examples):
        # Batched map function: `examples["conversations"]` is a list of conversations.
        convos = examples["conversations"]
        texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
        return { "text" : texts, }

    dataset = load_dataset("mlabonne/FineTome-100k", split = "train")
    dataset = standardize_sharegpt(dataset)
    dataset = dataset.map(formatting_prompts_func, batched = True,)

    # Show one example before and after templating as a sanity check.
    print(dataset[5]["conversations"])
    print(dataset[5]["text"])

    return dataset


README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

We look at how the conversations are structured for item 5:

<a name="Train"></a>
### Train the model
Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). Each sweep run below is capped at 1,500 steps (`max_steps` in `sweep_config`) to keep it short; for a full pass over the data, set `num_train_epochs=1` and remove the `max_steps` cap. We also support TRL's `DPOTrainer`!

In [None]:
import multiprocessing
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth.chat_templates import train_on_responses_only
from unsloth import is_bfloat16_supported
import wandb

# W&B sweep definition: Bayesian search that minimises the logged `eval/loss`.
# Entries with a single `value` are constants handed unchanged to every run.
sweep_config = dict(
    method="bayes",
    metric=dict(name="eval/loss", goal="minimize"),
    parameters=dict(
        learning_rate=dict(min=5e-6, max=2e-4),           # continuous range
        gradient_accumulation_steps=dict(values=[4, 8]),
        lora_dropout=dict(values=[0.05, 0.1]),
        weight_decay=dict(values=[0.01, 0.05]),
        max_steps=dict(value=1500),
        output_dir=dict(value=output_dir),
    ),
)


def _to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


def run_experiment(params=None, eval_examples=200):
    """Run one fine-tuning trial and log it to Weights & Biases.

    Designed to be the ``function`` of ``wandb.agent``, which calls it without
    arguments; the hyperparameters then come from the sweep via ``wandb.config``.

    Args:
        params: Optional config dict passed to ``wandb.init`` when the function is
            called by hand. It must provide ``learning_rate``, ``weight_decay``,
            ``gradient_accumulation_steps``, ``max_steps`` and ``output_dir``.
        eval_examples: Number of examples held out of the training data for
            evaluation, so that the ``eval/loss`` metric is actually logged.

    NOTE(review): ``get_model()`` is called with its defaults, so a swept
    ``lora_dropout`` value is recorded in the run config but not applied to the
    LoRA layers unless it is forwarded to the model setup.
    """
    # get_dataset() is called without arguments and renders prompts with a
    # notebook-level `tokenizer`, so the tokenizer loaded here is published as a
    # global instead of being kept local.
    global tokenizer
    model, tokenizer = get_model()

    with wandb.init(config=params) as run:
        config = wandb.config   # hyperparameters chosen by the sweep (or `params`)

        # Hold out a small, fixed evaluation split; without one the Trainer never
        # evaluates and the sweep's `eval/loss` objective is never reported.
        splits = get_dataset().train_test_split(test_size=eval_examples, seed=3407)

        # Checkpoint cadence; evaluation runs on the same schedule.
        checkpoint_every = 500

        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=splits["train"],
            eval_dataset=splits["test"],
            dataset_text_field="text",
            max_seq_length=max_seq_length,
            data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
            dataset_num_proc=2,
            packing=False,
            args=TrainingArguments(
                per_device_train_batch_size=2,
                per_device_eval_batch_size=2,
                gradient_accumulation_steps=config.gradient_accumulation_steps,
                warmup_steps=3,
                # num_train_epochs=1,
                max_steps=config.max_steps,
                learning_rate=config.learning_rate,
                fp16=not is_bfloat16_supported(),
                bf16=is_bfloat16_supported(),
                logging_steps=10,
                optim="adamw_8bit",
                weight_decay=config.weight_decay,
                lr_scheduler_type="linear",
                seed=3407,

                # One sub-directory per run so trials of the same sweep do not
                # overwrite or rotate away each other's checkpoints.
                output_dir=os.path.join(config.output_dir, run.id),

                # evaluation / save strategy
                eval_strategy="steps",
                eval_steps=checkpoint_every,
                save_strategy="steps",
                save_steps=checkpoint_every,
                save_total_limit=3,
                save_safetensors=True,
                report_to="wandb",
            ),
        )

        # Compute the loss on assistant turns only; user turns are masked out.
        trainer = train_on_responses_only(
            trainer,
            instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
            response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
        )

        # Sanity check of the masking: print one example's full input, then its
        # labels with every masked (-100) position replaced by a space token.
        sample = trainer.train_dataset[5]
        print(tokenizer.decode(sample["input_ids"]))
        space = tokenizer(" ", add_special_tokens = False).input_ids[0]
        print(tokenizer.decode([space if x == -100 else x for x in sample["labels"]]))

        # show gpu stats
        gpu_stats = torch.cuda.get_device_properties(0)
        start_gpu_memory = _to_gib(torch.cuda.max_memory_reserved())
        max_memory = _to_gib(gpu_stats.total_memory)
        print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
        print(f"{start_gpu_memory} GB of memory reserved.")

        # start training
        trainer_stats = trainer.train()

        # print gpu training stats
        used_memory = _to_gib(torch.cuda.max_memory_reserved())
        used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
        used_percentage = round(used_memory / max_memory * 100, 3)
        lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
        print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
        print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
        print(f"Peak reserved memory = {used_memory} GB.")
        print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
        print(f"Peak reserved memory % of max memory = {used_percentage} %.")
        print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Register the sweep defined in `sweep_config` (the original referenced an
# undefined `sweep_configuration`) and let one agent run four trials.
sweep_id = wandb.sweep(sweep=sweep_config, project="scalable_lab2")
wandb.agent(sweep_id, function=run_experiment, count=4)