<a href="https://colab.research.google.com/github/hgudella/hgudella.github.io/blob/master/Chitti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048     # Unsloth auto supports RoPE Scaling internally!
dtype = None              # None for auto detection
load_in_4bit = False      # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,   # LoRA rank - suggested values: 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,   # Supports any, but = 0 is optimized
    bias="none",      # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Ideal for long context tuning
    random_state=3407,
    use_rslora=False,   # Disable rank-sensitive LoRA for simpler tasks
    loftq_config=None   # No LoftQ, for standard fine-tuning
)

Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
from datasets import load_dataset

# Loading the dataset
dataset = load_dataset("hgudella/chitti-commands", split="train")

# Selecting a subset of 15K samples for fine-tuning
# dataset = dataset.select(range(15000))
print(f"Using a sample size of {len(dataset)} for fine-tuning.")

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

home-data.csv:   0%|          | 0.00/281k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/222 [00:00<?, ? examples/s]

Using a sample size of 222 for fine-tuning.


In [6]:
from unsloth.chat_templates import get_chat_template

# Initialize the tokenizer with the chat template and mapping
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True,        # Maps <|im_end|> to <|eot_id|> instead
)

In [7]:
dataset[0]

{'query': "Can I adjust the hall light's brightness to 40%?",
 'answers': 'Unfortunately, this feature is not supported by your device at the moment.',
 'tools': "[{'name': 'get_all_devices', 'description': 'Retrives a list of devices in the house.', 'parameters': {'house_id': {'description': 'The UUID of the house.', 'type': 'str', 'default': ''}}}, {'name': 'get_device_by_name', 'description': 'Retrives a device in the house by name.', 'parameters': {'house_name': {'description': 'The name of the house.', 'type': 'str', 'default': ''}}}, {'name': 'get_device_status_by_id', 'description': 'Retrives status the device for given ID.', 'parameters': {'device_id': {'description': 'The ID of the device in house.', 'type': 'str', 'default': ''}}}, {'name': 'turn_on_device', 'description': 'Turns ON the device for given device ID.', 'parameters': {'device_id': {'description': 'The ID of the device in house.', 'type': 'str', 'default': ''}}}, {'name': 'turn_off_device', 'description': 'Turns O

In [8]:
def formatting_prompts_func(examples):
    convos = []

    # Iterate through each item in the batch (examples are structured as lists of values)
    for query, tools, answers in zip(examples['query'], examples['tools'], examples['answers']):
        tool_user = {
            "content": f"You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:\n{tools}",
            "role": "system"
        }
        ques_user = {
            "content": f"{query}",
            "role": "user"
        }
        assistant = {
            "content": f"{answers}",
            "role": "assistant"
        }
        convos.append([tool_user, ques_user, assistant])

    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# Apply the formatting on dataset
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/222 [00:00<?, ? examples/s]

In [11]:
from transformers import TrainingArguments

args = TrainingArguments(
        per_device_train_batch_size = 8,  # Controls the batch size per device
        gradient_accumulation_steps = 2,  # Accumulates gradients to simulate a larger batch
        warmup_steps = 5,
        learning_rate = 2e-4,             # Sets the learning rate for optimization
        num_train_epochs = 3,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,              # Regularization term for preventing overfitting
        lr_scheduler_type = "linear",     # Chooses a linear learning rate decay
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb",              # Enables Weights & Biases (W&B) logging
        logging_steps = 1,                # Sets frequency of logging to W&B
        logging_strategy = "steps",       # Logs metrics at each specified step
        save_strategy = "no",
        load_best_model_at_end = True,    # Loads the best model at the end
        save_only_model = False           # Saves entire model, not only weights
    )

In [12]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,        # Can make training 5x faster for short sequences.
    args = args
)

Map (num_proc=2):   0%|          | 0/222 [00:00<?, ? examples/s]

In [13]:
# Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.486 GB of memory reserved.


In [14]:
from unsloth import unsloth_train

trainer_stats = unsloth_train(trainer)
print(trainer_stats)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 222 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 2
\        /    Total batch size = 16 | Total steps = 42
 "-____-"     Number of trainable parameters = 11,272,192
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,1.3908
2,1.4239
3,1.3162
4,1.2313
5,1.1512
6,1.0705
7,0.9504
8,0.8324
9,0.7495
10,0.6073


TrainOutput(global_step=42, training_loss=0.35937598666974474, metrics={'train_runtime': 322.6809, 'train_samples_per_second': 2.064, 'train_steps_per_second': 0.13, 'total_flos': 1837845970796544.0, 'train_loss': 0.35937598666974474, 'epoch': 3.0})


In [16]:
# Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

322.6809 seconds used for training.
5.38 minutes used for training.
Peak reserved memory = 3.783 GB.
Peak reserved memory for training = 1.297 GB.
Peak reserved memory % of max memory = 25.651 %.
Peak reserved memory for training % of max memory = 8.794 %.


In [22]:
# Local saving
# model.save_pretrained("hgudella/Llama-3.2-1B-Instruct")
# tokenizer.save_pretrained("hgudella/Llama-3.2-1B-Instruct")

# Online saving
model.push_to_hub("hgudella/Chitti-3.2-1B-Instruct", token="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")
tokenizer.push_to_hub("hgudella/Chitti-3.2-1B-Instruct", token="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")

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/hgudella/Chitti-3.2-1B-Instruct


README.md:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [23]:
# Merge to 16bit
model.save_pretrained_merged("unsloth/Llama-3.2-1B-Instruct", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged("hgudella/Chitti-3.2-1B-Instruct", tokenizer, save_method = "merged_16bit", token="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")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.5G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.46 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 21.85it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving unsloth/Llama-3.2-1B-Instruct/pytorch_model.bin...
Done.


Unsloth: You are pushing to hub, but you passed your HF username = hgudella.
We shall truncate hgudella/Chitti-3.2-1B-Instruct to Chitti-3.2-1B-Instruct


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.45 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 42.55it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving Chitti-3.2-1B-Instruct/pytorch_model.bin...


  0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/hgudella/Chitti-3.2-1B-Instruct


In [25]:
from unsloth import FastLanguageModel
from transformers import TextStreamer

max_seq_length = 2048
dtype = None
load_in_4bit = False
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "hgudella/Llama-3.2-1B-Instruct",        # Trained model either locally or from huggingface
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [26]:
available_tools_list = [
  {
    'name': 'get_all_devices',
    'description': 'Retrives a list of devices in the house.',
    'parameters': {
      'house_id': {
        'description': 'The UUID of the house.',
        'type': 'str',
        'default': ''
      }
    }
  },
  {
    'name': 'get_device_by_name',
    'description': 'Retrives a device in the house by name.',
    'parameters': {
      'house_name': {
        'description': 'The name of the house.',
        'type': 'str',
        'default': ''
      }
    }
  },
  {
    'name': 'get_device_status_by_id',
    'description': 'Retrives status the device for given ID.',
    'parameters': {
      'device_id': {
        'description': 'The ID of the device in house.',
        'type': 'str',
        'default': ''
      }
    }
  },
  {
    'name': 'turn_on_device',
    'description': 'Turns ON the device for given device ID.',
    'parameters': {
      'device_id': {
        'description': 'The ID of the device in house.',
        'type': 'str',
        'default': ''
      }
    }
  },
  {
    'name': 'turn_off_device',
    'description': 'Turns OFF the device for given device ID.',
    'parameters': {
      'device_id': {
        'description': 'The ID of the device in house.',
        'type': 'str',
        'default': ''
      }
    }
  }
]

In [31]:
query = "Can you make the hall light brighter?"
chat = [
    {"role":"system","content": f"You are a helpful assistant with access to the following function calls. Your task is handle smart home lights. Use the following function calls as required.\n{available_tools_list}"},
    {"role": "user", "content": query }
]

In [32]:
inputs = tokenizer.apply_chat_template(
    chat,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 1024, use_cache = True)
response = tokenizer.batch_decode(outputs)[0]
print(response)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following function calls. Your task is handle smart home lights. Use the following function calls as required.
[{'name': 'get_all_devices', 'description': 'Retrives a list of devices in the house.', 'parameters': {'house_id': {'description': 'The UUID of the house.', 'type':'str', 'default': ''}}}, {'name': 'get_device_by_name', 'description': 'Retrives a device in the house by name.', 'parameters': {'house_name': {'description': 'The name of the house.', 'type':'str', 'default': ''}}}, {'name': 'get_device_status_by_id', 'description': 'Retrives status the device for given ID.', 'parameters': {'device_id': {'description': 'The ID of the device in house.', 'type':'str', 'default': ''}}}, {'name': 'turn_on_device', 'description': 'Turns ON the device for given device ID.', 'parameters': {'device_id': {'description': 'The ID of the device in house.', 'type':'str', 'default': ''}}},