In [None]:
# %pip show bitsandbytes peft accelerate transformers

Name: bitsandbytes
Version: 0.45.0
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/bitsandbytes-foundation/bitsandbytes
Author: Tim Dettmers
Author-email: dettmers@cs.washington.edu
License: MIT
Location: /home/hb/.conda/envs/unsloth_env/lib/python3.10/site-packages
Requires: numpy, torch, typing_extensions
Required-by: unsloth
---
Name: peft
Version: 0.14.0
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface/peft
Author: The HuggingFace team
Author-email: benjamin@huggingface.co
License: Apache
Location: /home/hb/.local/lib/python3.10/site-packages
Requires: accelerate, huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch, tqdm, transformers
Required-by: unsloth, unsloth_zoo
---
Name: accelerate
Version: 1.2.1
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /home/hb/.

In [1]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")

PyTorch version: 2.2.0
CUDA version: 12.1


In [2]:
from unsloth import FastLanguageModel
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import os

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: OpenAI failed to import - ignoring for now.
🦥 Unsloth Zoo will now patch everything to make training faster!


'cuda'

In [None]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "unsloth/Meta-Llama-3.1-8B",

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(device)

max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_id,
    max_seq_length = max_seq_length,
    dtype = dtype,
    # load_in_4bit = load_in_4bit,
    # token = "meta-llama/Meta-Llama-3.1-8B-Instruct",
)

model.eval()
print(f"Model loaded on {device}")

==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.2.0. CUDA: 8.6. CUDA Toolkit: 12.1. Triton: 2.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.24. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded on cuda:0


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
data = load_dataset("json", data_files="/home/hb/LLM-research/openai/generated_instructions/test_3.json")
data["train"]

Dataset({
    features: ['instruction', 'input', 'output', 'most_similar_instructions', 'avg_similarity_score'],
    num_rows: 2264
})

In [6]:
CUTOFF_LEN = 2048

def generate_prompt(data_point):
    """
    Create the text prompt from your instruction, input, and output fields.
    """
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data_point["instruction"]}

### Input:
{data_point["input"]}

### Response:
{data_point["output"]}"""

def tokenize(prompt, add_eos_token=True):
    """
    Tokenizes the prompt. Optionally pads to max_length=2048 and appends an EOS token.
    Copies input_ids to labels for causal LM.
    """
    # Here, we use padding="max_length" to get uniform-length sequences of 2048.
    # Alternatively, you can use padding=False and rely on a data collator.
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding="max_length",   # or padding=False + data_collator
        return_tensors=None,    # return raw Python lists
    )

    input_ids = result["input_ids"]
    attention_mask = result["attention_mask"]

    # Optionally place an EOS token at the very end if there's room
    if (
        add_eos_token
        and len(input_ids) == CUTOFF_LEN
        and input_ids[-1] != tokenizer.eos_token_id
    ):
        # Replace last token with EOS if you'd like
        input_ids[-1] = tokenizer.eos_token_id
        attention_mask[-1] = 1

    labels = input_ids.copy()
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

def generate_and_tokenize_prompt(data_point):
    """
    Combines prompt generation with tokenization.
    """
    full_prompt = generate_prompt(data_point)
    return tokenize(full_prompt)

# Example: split the "train" set into train/val
train_val = data["train"].train_test_split(test_size=200, shuffle=True, seed=42)
train_data = train_val["train"].map(generate_and_tokenize_prompt)
val_data   = train_val["test"].map(generate_and_tokenize_prompt)


Map: 100%|██████████| 2064/2064 [00:08<00:00, 240.22 examples/s]
Map: 100%|██████████| 200/200 [00:01<00:00, 154.11 examples/s]


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported


output_dir = "/home/hb/dataset_bgp/BGP-LLaMA3-5k"
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 200
logging_steps = 500
learning_rate = 1e-4
max_grad_norm = 0.3
max_steps = 5000
warmup_ratio = 0.05
lr_scheduler_type = "cosine"


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    eval_dataset = val_data,
    dataset_text_field = "output",
    logging_steps = 200,
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 1,
        # warmup_steps = 5,
        warmup_ratio = 0.05,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 5000,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "paged_adamw_32bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        output_dir = "/home/hb/dataset_bgp/BGP-LLaMA3-5k/outputs",
        report_to = "none",
    ),
)


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,064 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 1
\        /    Total batch size = 4 | Total steps = 5,000
 "-____-"     Number of trainable parameters = 167,772,160
  4%|▍         | 200/5000 [16:15<6:30:48,  4.89s/it]

{'loss': 0.4409, 'grad_norm': 0.1623951494693756, 'learning_rate': 8e-05, 'epoch': 0.39}


  8%|▊         | 400/5000 [32:33<6:14:57,  4.89s/it]

{'loss': 0.1854, 'grad_norm': 0.11636804789304733, 'learning_rate': 9.975414512725057e-05, 'epoch': 0.78}


 12%|█▏        | 600/5000 [48:55<5:58:29,  4.89s/it]

{'loss': 0.1679, 'grad_norm': 0.11436620354652405, 'learning_rate': 9.86663298624003e-05, 'epoch': 1.16}


 16%|█▌        | 800/5000 [1:05:12<5:42:03,  4.89s/it]

{'loss': 0.1564, 'grad_norm': 0.0949527770280838, 'learning_rate': 9.672822322997305e-05, 'epoch': 1.55}


 20%|██        | 1000/5000 [1:21:30<5:25:56,  4.89s/it]

{'loss': 0.1515, 'grad_norm': 0.11936292797327042, 'learning_rate': 9.397368756032445e-05, 'epoch': 1.94}


 24%|██▍       | 1200/5000 [1:38:09<6:07:09,  5.80s/it]

{'loss': 0.1382, 'grad_norm': 0.1101531833410263, 'learning_rate': 9.045084971874738e-05, 'epoch': 2.33}


 28%|██▊       | 1400/5000 [1:55:58<5:14:56,  5.25s/it]

{'loss': 0.134, 'grad_norm': 0.12722229957580566, 'learning_rate': 8.622126023955446e-05, 'epoch': 2.71}


 32%|███▏      | 1600/5000 [2:14:14<5:27:24,  5.78s/it]

{'loss': 0.1287, 'grad_norm': 0.11256109923124313, 'learning_rate': 8.135881792367686e-05, 'epoch': 3.1}


 36%|███▌      | 1800/5000 [2:32:51<5:26:25,  6.12s/it]

{'loss': 0.1176, 'grad_norm': 0.1252177357673645, 'learning_rate': 7.594847868906076e-05, 'epoch': 3.49}


 40%|████      | 2000/5000 [2:50:34<4:07:51,  4.96s/it]

{'loss': 0.1163, 'grad_norm': 0.11842474341392517, 'learning_rate': 7.008477123264848e-05, 'epoch': 3.88}


 44%|████▍     | 2200/5000 [3:06:57<3:48:45,  4.90s/it]

{'loss': 0.1026, 'grad_norm': 0.1291685253381729, 'learning_rate': 6.387014543809223e-05, 'epoch': 4.26}


 48%|████▊     | 2400/5000 [3:23:16<3:32:11,  4.90s/it]

{'loss': 0.0982, 'grad_norm': 0.16952967643737793, 'learning_rate': 5.74131823855921e-05, 'epoch': 4.65}


 52%|█████▏    | 2600/5000 [3:39:39<3:15:37,  4.89s/it]

{'loss': 0.0955, 'grad_norm': 0.1537928730249405, 'learning_rate': 5.0826697238317935e-05, 'epoch': 5.04}


 56%|█████▌    | 2800/5000 [3:55:58<2:59:30,  4.90s/it]

{'loss': 0.0772, 'grad_norm': 0.18353110551834106, 'learning_rate': 4.4225768151520694e-05, 'epoch': 5.43}


 60%|██████    | 3000/5000 [4:12:17<2:43:06,  4.89s/it]

{'loss': 0.0784, 'grad_norm': 0.18229827284812927, 'learning_rate': 3.772572564296005e-05, 'epoch': 5.81}


 64%|██████▍   | 3200/5000 [4:28:41<2:26:43,  4.89s/it]

{'loss': 0.0682, 'grad_norm': 0.15163271129131317, 'learning_rate': 3.144013755408895e-05, 'epoch': 6.2}


 68%|██████▊   | 3400/5000 [4:45:00<2:10:28,  4.89s/it]

{'loss': 0.0599, 'grad_norm': 0.19006147980690002, 'learning_rate': 2.547882480847461e-05, 'epoch': 6.59}


 72%|███████▏  | 3600/5000 [5:01:22<1:54:09,  4.89s/it]

{'loss': 0.06, 'grad_norm': 0.17052960395812988, 'learning_rate': 1.9945942635848748e-05, 'epoch': 6.98}


 76%|███████▌  | 3800/5000 [5:17:40<1:37:51,  4.89s/it]

{'loss': 0.0448, 'grad_norm': 0.14904135465621948, 'learning_rate': 1.4938160786375572e-05, 'epoch': 7.36}


 80%|████████  | 4000/5000 [5:33:58<1:21:32,  4.89s/it]

{'loss': 0.0445, 'grad_norm': 0.14647755026817322, 'learning_rate': 1.0542974530180327e-05, 'epoch': 7.75}


 84%|████████▍ | 4200/5000 [5:51:20<1:08:22,  5.13s/it]

{'loss': 0.0414, 'grad_norm': 0.20893704891204834, 'learning_rate': 6.837175952121306e-06, 'epoch': 8.14}


 88%|████████▊ | 4400/5000 [6:09:07<51:01,  5.10s/it]  

{'loss': 0.0344, 'grad_norm': 0.15667372941970825, 'learning_rate': 3.885512251130763e-06, 'epoch': 8.53}


 92%|█████████▏| 4600/5000 [6:27:01<38:39,  5.80s/it]

{'loss': 0.0345, 'grad_norm': 0.17932827770709991, 'learning_rate': 1.7395544861325718e-06, 'epoch': 8.91}


 96%|█████████▌| 4800/5000 [6:44:57<19:29,  5.85s/it]

{'loss': 0.0315, 'grad_norm': 0.1468893140554428, 'learning_rate': 4.367965336512403e-07, 'epoch': 9.3}


100%|██████████| 5000/5000 [7:03:08<00:00,  5.60s/it]

{'loss': 0.0306, 'grad_norm': 0.183025062084198, 'learning_rate': 0.0, 'epoch': 9.69}


100%|██████████| 5000/5000 [7:03:12<00:00,  5.08s/it]

{'train_runtime': 25392.0246, 'train_samples_per_second': 0.788, 'train_steps_per_second': 0.197, 'train_loss': 0.1055377257347107, 'epoch': 9.69}





In [None]:
new_model = "/home/hb/dataset_bgp/finetuned_models/LLaMA3-8B-instruct-analysis-5k-no_4bit_paged_adam32"

In [12]:
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('/home/hb/dataset_bgp/finetuned_models/LLaMA3-8B-analysis-5k-no_4bit_paged_adam32/tokenizer_config.json',
 '/home/hb/dataset_bgp/finetuned_models/LLaMA3-8B-analysis-5k-no_4bit_paged_adam32/special_tokens_map.json',
 '/home/hb/dataset_bgp/finetuned_models/LLaMA3-8B-analysis-5k-no_4bit_paged_adam32/tokenizer.json')

In [2]:
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
    TextStreamer
)
import torch
from torch import cuda, bfloat16

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
model = PeftModel.from_pretrained(base_model, new_model)

model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer.padding_side = "right"

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.72s/it]
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
logging.set_verbosity(logging.CRITICAL)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1012)
result = pipe(f"Summarize the AS paths for each prefix associated with ASN AS4766 over the period oct 28 13:00 to oct 28 13:15, 2024. Provide minimum, maximum, and median AS path lengths and highlight any significant path changes observed in BGP updates.")
print(result[0]['generated_text'])