# End-to-end fine-tuning of Llama 3.2 3B on a custom chat dataset
## Using torchtune to fine-tune and vLLM to deploy

In [59]:
# Report the GPU model, driver/CUDA version and current memory usage.
!nvidia-smi

Wed Feb 12 18:06:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  |   00000000:C8:00.0 Off |                    0 |
| N/A   39C    P0             67W /  400W |       4MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [24]:
# !pip install vllm 

In [25]:
# !pip install torchtune torchao bitsandbytes  

In [27]:
# Report the installed versions of the libraries this notebook depends on.

import torch
import torchvision
import torchao
import bitsandbytes as bnb

# (label, module) pairs; each line is printed as "<label> version: <version>".
for label, module in (
    ("PyTorch", torch),
    ("Torchvision", torchvision),
    ("Torchao", torchao),
    ("Bitsandbytes", bnb),
):
    print(f"{label} version:", module.__version__)


PyTorch version: 2.5.1+cu124
Torchvision version: 0.20.1+cu124
Torchao version: 0.8.0
Bitsandbytes version: 0.45.2


In [28]:
import platform

import psutil
import torch

# Gather the host facts first, then print them in one place.
ram = psutil.virtual_memory()
gpu_available = torch.cuda.is_available()

print(f"Total RAM: {ram.total / 1e9:.2f} GB")  # bytes -> decimal gigabytes
print(f"CPU Architecture: {platform.machine()}")
print(f"GPU Available: {gpu_available}")

# The device name can only be queried when CUDA is usable.
if gpu_available:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Total RAM: 1081.44 GB
CPU Architecture: x86_64
GPU Available: True
GPU Name: NVIDIA A100-SXM4-80GB


In [29]:
from datasets import load_dataset

# Absolute path on purpose: the fine-tuning configs in this notebook read the
# dataset from exactly this location (`dataset.data_files`), so the export must
# not depend on the kernel's current working directory.
DATASET_JSON_PATH = "/home/chat_dataset_SlimOrca-Dedup2.json"

# ds = load_dataset("ruslanmv/ai-medical-chatbot")
ds2 = load_dataset('Open-Orca/SlimOrca-Dedup')

# Export only the first training conversation: a one-row sample that is enough
# to exercise the pipeline end to end. The call's return value is the cell output.
ds2['train'].select(range(1)).to_json(DATASET_JSON_PATH)

  from .autonotebook import tqdm as notebook_tqdm
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 676.17ba/s]


4130

In [31]:
# Peek at the first training conversation (the same row that was exported to JSON).
ds2["train"][0]

{'conversations': [{'from': 'system',
   'value': 'You are an AI assistant. You will be given a task. You must generate a detailed and long answer.'},
  {'from': 'human',
   'value': 'Write an article based on this "A man has been charged with murder and attempted murder after a woman and the man she was on a date with were stabbed at a restaurant in Sydney, Australia."'},
  {'from': 'gpt',

In [6]:
# Make sure the directory that will hold the pretrained checkpoint exists.
import os

directory_path = "/home/Llama-3.2-3B-Instruct/"
os.makedirs(directory_path, exist_ok=True)  # no error if it is already there

print(f"Directory '{directory_path}' is ready.")

Directory '/home/Llama-3.2-3B-Instruct/' is ready.


In [7]:
# List the torchtune CLI subcommands (download, ls, cp, run, validate).
!tune --help

usage: tune [-h] {download,ls,cp,run,validate} ...

Welcome to the torchtune CLI!

options:
  -h, --help            show this help message and exit

subcommands:
  {download,ls,cp,run,validate}
    download            Download a model from the Hugging Face Hub or Kaggle
                        Model Hub.
    ls                  List all built-in recipes and configs
    cp                  Copy a built-in recipe or config to a local path.
    run                 Run a recipe. For distributed recipes, this supports
                        all torchrun arguments.
    validate            Validate a config and ensure that it is well-formed.


In [9]:
# download pretrained model
# !tune download meta-llama/Llama-3.2-3B-Instruct \
#   --output-dir /home/Llama-3.2-3B-Instruct/ \
#   --hf-token <HF_TOKEN>

# Finetune Config

In [35]:
%%writefile /home/finetune-cfg-cpu.yaml
# Full fine-tune of Llama 3.2 3B on a single device, CPU variant.
# Checkpoints, logs and profiler traces are written under output_dir;
# change it to your preference.
output_dir: /home/torchtune/llama3_2_3B/full_single_device_cpu

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /home/Llama-3.2-3B-Instruct/original/tokenizer.model
  max_seq_len: null

# Dataset: local JSON file with ShareGPT-style conversations
# (a `conversations` column of {from, value} turns).
dataset:
  _component_: torchtune.datasets.chat_dataset
  source: json
  conversation_column: conversations
  conversation_style: sharegpt
  data_files: /home/chat_dataset_SlimOrca-Dedup2.json
  split: train

# dataset:
#   _component_: torchtune.datasets.alpaca_dataset
#   packed: False  # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.llama3_2.llama3_2_3b

# Loads the pretrained Hugging Face safetensors shards and writes the
# fine-tuned checkpoint under output_dir.
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /home/Llama-3.2-3B-Instruct/
  checkpoint_files: [
    model-00001-of-00002.safetensors,
    model-00002-of-00002.safetensors,
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3_2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 1
epochs: 1
optimizer:
  # The GPU config uses bitsandbytes' PagedAdamW8bit, but its paged / 8-bit
  # optimizer kernels operate on CUDA tensors. With `device: cpu` use the
  # stock PyTorch AdamW instead.
  _component_: torch.optim.AdamW
  lr: 2e-5
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1  # Use to increase effective batch size
optimizer_in_bwd: True  # True saves memory. Requires gradient_accumulation_steps=1
clip_grad_norm: null
compile: False  # torch.compile the model + loss, True increases speed + decreases memory

# Training environment
device: cpu

# Memory management
enable_activation_checkpointing: True  # True reduces memory
enable_activation_offloading: False  # True reduces memory

# Precision (full fp32 for the CPU run)
dtype: fp32

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  #Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  #`torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  #trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1

Overwriting /home/finetune-cfg-cpu.yaml


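# To run on GPU

The GPU config below is shown as markdown, so nothing is written to disk until it is pasted into a code cell and executed; doing so creates `/home/finetune-cfg.yaml`.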

```yaml
%%writefile /home/finetune-cfg.yaml
# Full fine-tune of Llama 3.2 3B on a single CUDA device in bf16.
output_dir: /home/torchtune/llama3_2_3B/full_single_device # Checkpoints, logs and profiler traces go under this directory. Change it to your preference.

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /home/Llama-3.2-3B-Instruct/original/tokenizer.model
  max_seq_len: null

# Dataset: local JSON file with ShareGPT-style conversations
# (a `conversations` column of {from, value} turns).

dataset:
  _component_: torchtune.datasets.chat_dataset
  source: json
  conversation_column: conversations
  conversation_style: sharegpt
  data_files: /home/chat_dataset_SlimOrca-Dedup2.json
  split: train

# Alternative built-in dataset, kept for reference:
# dataset:
#   _component_: torchtune.datasets.alpaca_dataset
#   packed: False  # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.llama3_2.llama3_2_3b

# Reads the pretrained Hugging Face safetensors shards from checkpoint_dir and
# writes the fine-tuned checkpoint under output_dir.
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /home/Llama-3.2-3B-Instruct/
  checkpoint_files: [
    model-00001-of-00002.safetensors,
    model-00002-of-00002.safetensors,
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: LLAMA3_2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 1
epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW8bit
  lr: 2e-5
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1  # Use to increase effective batch size
optimizer_in_bwd: True  # True saves memory. Requires gradient_accumulation_steps=1
clip_grad_norm: null
compile: False  # torch.compile the model + loss, True increases speed + decreases memory

# Training environment
device: cuda

# Memory management
enable_activation_checkpointing: True  # True reduces memory
enable_activation_offloading: False  # True reduces memory

# Reduced precision
dtype: bf16

# Logging: metrics are written to disk under ${output_dir}/logs
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  #Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  #`torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  #trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
```

# Train

In [42]:
# CPU run: launch the single-device full fine-tuning recipe with /home/finetune-cfg-cpu.yaml.
# The trailing key=value argument overrides checkpointer.checkpoint_dir from the
# command line (here it repeats the value already set in the config).
!tune run full_finetune_single_device --config /home/finetune-cfg-cpu.yaml checkpointer.checkpoint_dir=/home/Llama-3.2-3B-Instruct/

INFO:torchtune.utils._logging:Running FullFinetuneRecipeSingleDevice with resolved config:

batch_size: 1
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /home/Llama-3.2-3B-Instruct/
  checkpoint_files:
  - model-00001-of-00002.safetensors
  - model-00002-of-00002.safetensors
  model_type: LLAMA3_2
  output_dir: /home/torchtune/llama3_2_3B/full_single_device_cpu
  recipe_checkpoint: null
clip_grad_norm: null
compile: false
dataset:
  _component_: torchtune.datasets.chat_dataset
  conversation_column: conversations
  conversation_style: sharegpt
  data_files: /home/chat_dataset_SlimOrca-Dedup2.json
  source: json
  split: train
device: cpu
dtype: fp32
enable_activation_checkpointing: true
enable_activation_offloading: false
epochs: 1
gradient_accumulation_steps: 1
log_every_n_steps: 1
log_peak_memory_stats: true
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
metric_logger:
  _component_: torchtune.t

```bash
# GPU run: same recipe, pointed at the GPU config /home/finetune-cfg.yaml
!tune run full_finetune_single_device --config /home/finetune-cfg.yaml checkpointer.checkpoint_dir=/home/Llama-3.2-3B-Instruct/
```

# Test Generation of finetuned model

In [16]:
# !tree -a /home/torchtune/llama3_2_3B/full_single_device/

In [20]:
%%writefile /home/cfg_gen.yaml
# Config for torchtune's `generate` recipe, pointed at the fine-tuned checkpoint.
output_dir: ./  # Not needed

# --- Model and weights ---
model:
  _component_: torchtune.models.llama3_2.llama3_2_3b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /home/torchtune/llama3_2_3B/full_single_device/epoch_0/
  checkpoint_files:
    - ft-model-00001-of-00002.safetensors
    - ft-model-00002-of-00002.safetensors
  output_dir: ${output_dir}
  model_type: LLAMA3_2

# --- Tokenizer ---
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /home/Llama-3.2-3B-Instruct/original/tokenizer.model
  max_seq_len: null
  prompt_template: null

# --- Runtime ---
device: cuda
dtype: bf16
seed: 1234

# --- Generation (defaults taken from gpt-fast) ---
prompt:
  system: null
  user: "Tell me a joke."
max_new_tokens: 300
temperature: 0.6  # 0.8 and 0.6 are popular values to try
top_k: 300

enable_kv_cache: True

quantizer: null

Overwriting /home/cfg_gen.yaml


In [1]:
# Run torchtune's `generate` recipe with the config written to /home/cfg_gen.yaml.
!tune run generate --config /home/cfg_gen.yaml

# Upload to huggingface

In [2]:
# Authenticate before running this cell (`huggingface-cli login`, or export
# HF_TOKEN in the environment) instead of pasting a token on the command line:
# anything typed here is saved with the notebook and its outputs.
!huggingface-cli upload mendeza/Llama-3.2-3B-Instruct /home/torchtune/llama3_2_3B/full_single_device/epoch_0/

# Test loading and running with VLLM

In [1]:
# del llm

In [3]:
from vllm import LLM, SamplingParams

def print_outputs(outputs):
    """Print each request's prompt next to its first generated completion.

    Args:
        outputs: Iterable of result objects exposing ``.prompt`` and
            ``.outputs``; only ``.outputs[0].text`` is shown for each.

    A rule of 80 dashes is printed once, after all results.
    """
    for result in outputs:
        print(
            f"Prompt: {result.prompt!r}, Generated text: {result.outputs[0].text!r}"
        )
    print("-" * 80)

# TODO: point `model` at the epoch directory you want to load.
llm = LLM(
    model="/home/torchtune/llama3_2_3B/full_single_device/epoch_0/",
    load_format="safetensors",
    kv_cache_dtype="auto",
    max_model_len=4096,
)

# max_tokens=16 cuts the reply off after 16 tokens, which keeps this check quick.
sampling_params = SamplingParams(temperature=0.5, max_tokens=16)

# A short multi-turn chat that ends on a user message for the model to answer.
conversation = [
    dict(role="system", content="You are a helpful assistant"),
    dict(role="user", content="Hello"),
    dict(role="assistant", content="Hello! How can I assist you today?"),
    dict(
        role="user",
        content="Write an essay about the importance of higher education.",
    ),
]

outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=False)
print_outputs(outputs)


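# Deploy vLLM Server

Run the command below in a separate terminal and leave it running; the client cells that follow connect to it at `http://127.0.0.1:8000/v1`.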

```bash
python -m vllm.entrypoints.openai.api_server --model /home/torchtune/llama3_2_3B/full_single_device/epoch_0/
```

In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install openai

In [5]:
from openai import OpenAI

# Connection settings for the local vLLM server. The API key is only a
# placeholder (it can be an empty string for local vLLM).
openai_api_key = "EMPTY"
openai_api_base = "http://127.0.0.1:8000/v1"

# Synchronous OpenAI-compatible client pointed at the local server.
client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Request a plain text completion. `model` is the same path that was passed to
# `--model` when the server was launched; replace it if you serve another model.
completion = client.completions.create(
    model="/home/torchtune/llama3_2_3B/full_single_device/epoch_0/",
    prompt="How to implement binary cross entropy using torch primitives in python",
    max_tokens=1024
)

print("Completion result:", completion.choices[0].text)

In [37]:
# streaming

In [6]:
import asyncio
import nest_asyncio
from openai import AsyncOpenAI

# Apply nest_asyncio to avoid "RuntimeError: asyncio.run()..." when an event
# loop is already running in the notebook.
# NOTE(review): this cell drives the coroutine with a top-level `await` rather
# than asyncio.run(), so the patch may not be needed here; confirm before removing.
nest_asyncio.apply()

# Set API credentials ("EMPTY" is a placeholder key for the local server)
openai_api_key = "EMPTY"
openai_api_base = "http://127.0.0.1:8000/v1"

# Create an async OpenAI client; this rebinds the module-level name `client`,
# which stream_completion() reads when it is called.
client = AsyncOpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Async function to stream completions
async def stream_completion():
    """Request a streamed text completion and echo it to stdout as it arrives.

    Uses the module-level ``client``. The "Completion result:" prefix is printed
    once, then every non-empty text fragment is written without a separator or
    trailing newline. Chunks with no choices or with empty text are skipped.
    """
    request = dict(
        model="/home/torchtune/llama3_2_3B/full_single_device/epoch_0/",
        prompt="How to implement binary cross entropy using torch primitives in Python",
        max_tokens=1024,
        stream=True,  # deliver the completion incrementally
        top_p=1,
        temperature=0.1,
    )
    stream = await client.completions.create(**request)

    print("Completion result:", end=" ", flush=True)

    async for event in stream:
        if not event.choices:
            continue
        piece = event.choices[0].text
        if piece:
            print(piece, end="", flush=True)  # flush so text shows up immediately

# Run the coroutine with a top-level `await`, which Jupyter supports directly in a cell.
await stream_completion()
