# Instruction Finetuning

In this notebook, we investigate fine-tuning a language model on the *UNSW-NB15* dataset using various preprocessing techniques.

# Key-Value pairs Text Encoding

In [4]:
import os
from os import getenv

from datasets import load_dataset, Dataset
from dotenv import load_dotenv

# NOTE: this cell used to contain the bare expression
# `Dataset.cleanup_cache_files`. That only looks the method up on the class
# and never calls it, so it had no effect and has been dropped.

# Pull variables from a local .env file into the environment, then read the
# Hugging Face access token that later cells pass to `from_pretrained`.
load_dotenv()
HUGGING_FACE_READ_TOKEN = getenv("HUGGING_FACE_READ_TOKEN")

# Stream the "train" split from the Arrow files under NF-UNSW-NB15/.
dataset = load_dataset("arrow", data_dir="NF-UNSW-NB15/", streaming=True, split="train")

# Names attached to the `chosen` feature; displayed as the cell output.
classes = dataset.features["chosen"].names
classes

['0', '1']

In [5]:
import gc

import torch
import mlflow
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

# MLflow tracking configuration: server address and the experiment to activate.
MLFLOW_TRACKING_URI = "http://127.0.0.1:8080"
MLFLOW_EXPERIMENT_NAME = "Testing finetuning"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the final expression so the returned Experiment is shown as output.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/967304590420382862', creation_time=1715755222120, experiment_id='967304590420382862', last_update_time=1715755222120, lifecycle_stage='active', name='Testing finetuning', tags={}>

In [6]:
# Choose the attention implementation and dtype from the GPU's compute
# capability: major version >= 8 selects FlashAttention-2 with bfloat16,
# anything else selects eager attention with float16.
#
# `torch.cuda.get_device_capability()` raises when no CUDA device is usable,
# so availability is checked first and such machines take the eager/float16
# branch instead of crashing in this cell.
supports_flash_attention = (
    torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
)

if supports_flash_attention:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [7]:
# Identifiers: the Hub checkpoint that is fine-tuned, and the name under which
# the trained result is saved (and later reloaded as an adapter).
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"
new_model = "OrpoLlama-3-8B"

# QLoRA: load the weights in 4-bit NF4 with double quantisation; the compute
# dtype follows the dtype selected for this GPU in the previous cell.
quantization_settings = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
bnb_config = BitsAndBytesConfig(**quantization_settings)

# LoRA: rank-16 adapters on the listed projection modules.
lora_target_modules = ['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# The same access token is used for both Hub downloads.
hub_auth = {"token": HUGGING_FACE_READ_TOKEN}

tokenizer = AutoTokenizer.from_pretrained(base_model, **hub_auth)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    attn_implementation=attn_implementation,
    **hub_auth,
)

# Order matters: the chat format is applied to the (model, tokenizer) pair
# before the model is prepared for k-bit training.
model, tokenizer = setup_chat_format(model, tokenizer)
model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.15s/it]


In [34]:
def format_chat_template(row):
    """Render a row's ``chosen`` and ``rejected`` fields as chat-template text.

    Each field is passed through the module-level ``tokenizer``'s
    ``apply_chat_template`` with ``tokenize=False`` and written back onto
    ``row`` under the same key.

    Args:
        row: A single dataset example holding ``chosen`` and ``rejected``.

    Returns:
        The same ``row`` object with both fields replaced.
    """
    for column in ("chosen", "rejected"):
        row[column] = tokenizer.apply_chat_template(row[column], tokenize=False)
    return row


dataset = dataset.map(format_chat_template)

In [21]:
dataset_name = "Jetlime/NF-UNSW-NB15"
dataset = load_dataset(dataset_name, split="train")
# To experiment on a subset: dataset = dataset.shuffle(seed=42).select(range(1000))


def format_chat_template(row):
    """Return ``row`` unchanged.

    All formatting steps are currently disabled: neither the tokenizer's chat
    template nor a ``str()`` cast is applied to ``chosen`` / ``rejected``.

    NOTE(review): this definition replaces any earlier ``format_chat_template``
    in the kernel, and mapping it over the dataset leaves every row as-is.
    """
    return row


worker_count = os.cpu_count()
dataset = dataset.map(format_chat_template, num_proc=worker_count)

# Displayed as the cell output: the Arrow table backing the dataset.
dataset.data

Downloading readme: 100%|██████████| 4.84k/4.84k [00:00<00:00, 8.40MB/s]
Downloading data: 100%|██████████| 71.5M/71.5M [00:38<00:00, 1.83MB/s]
Downloading data: 100%|██████████| 7.95M/7.95M [00:05<00:00, 1.40MB/s]
Generating train split: 100%|██████████| 1460806/1460806 [00:00<00:00, 2688909.34 examples/s]
Generating test split: 100%|██████████| 162312/162312 [00:00<00:00, 2763535.32 examples/s]
Map (num_proc=16): 100%|██████████| 1460806/1460806 [00:05<00:00, 244189.26 examples/s]


ConcatenationTable
prompt: string
chosen: string
rejected: string
----
prompt: [["### Question:IPV4_SRC_ADDR: 59.166.0.3, L4_SRC_PORT: 58855, IPV4_DST_ADDR: 149.171.126.6, L4_DST_PORT: 80, PROTOCOL: 6, L7_PROTO: 7.0, IN_BYTES: 1684, OUT_BYTES: 10168, IN_PKTS: 14, OUT_PKTS: 18, TCP_FLAGS: 27, FLOW_DURATION_MILLISECONDS: 1948 ### Answer:","### Question:IPV4_SRC_ADDR: 59.166.0.6, L4_SRC_PORT: 39483, IPV4_DST_ADDR: 149.171.126.7, L4_DST_PORT: 6496, PROTOCOL: 6, L7_PROTO: 0.0, IN_BYTES: 5590, OUT_BYTES: 92028, IN_PKTS: 98, OUT_PKTS: 96, TCP_FLAGS: 27, FLOW_DURATION_MILLISECONDS: 0 ### Answer:","### Question:IPV4_SRC_ADDR: 59.166.0.5, L4_SRC_PORT: 40896, IPV4_DST_ADDR: 149.171.126.8, L4_DST_PORT: 111, PROTOCOL: 17, L7_PROTO: 11.0, IN_BYTES: 568, OUT_BYTES: 320, IN_PKTS: 4, OUT_PKTS: 4, TCP_FLAGS: 0, FLOW_DURATION_MILLISECONDS: 258 ### Answer:","### Question:IPV4_SRC_ADDR: 59.166.0.1, L4_SRC_PORT: 10258, IPV4_DST_ADDR: 149.171.126.9, L4_DST_PORT: 111, PROTOCOL: 17, L7_PROTO: 0.0, IN_BYTES: 56

In [22]:
# `evaluation_strategy="steps"` below needs an evaluation set. `dataset` was
# loaded with split="train", so `dataset["test"]` is not available; the
# held-out split is loaded explicitly instead.
# NOTE(review): the recorded load log reports 162,312 test rows, so every
# evaluation pass is expensive -- subsample here if that is too slow.
eval_dataset = load_dataset(dataset_name, split="test")

orpo_args = ORPOConfig(
    learning_rate=8e-6,
    beta=0.1,
    lr_scheduler_type="linear",
    max_length=1024,
    max_prompt_length=512,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    # A float below 1 is read as a fraction of the total training steps.
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    report_to="mlflow",
    output_dir="results"
)

trainer = ORPOTrainer(
    model=model,
    args=orpo_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
)
trainer.train()
# Save the trained model under `new_model`; the merge cell reloads it by that name.
trainer.save_model(new_model)

Map: 100%|██████████| 1460806/1460806 [18:02<00:00, 1349.78 examples/s]
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss




In [None]:
# Flush memory: drop the trainer and the quantised training model, then
# release cached CUDA memory before loading the half-precision copy.
del trainer, model
gc.collect()
torch.cuda.empty_cache()

# Reload tokenizer and model. The base checkpoint is gated, so the same access
# token used for the initial load is passed here as well.
tokenizer = AutoTokenizer.from_pretrained(base_model, token=HUGGING_FACE_READ_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
    token=HUGGING_FACE_READ_TOKEN,
)
# Apply the same chat-format setup as before training so the reloaded
# model/tokenizer pair matches the one the adapter was trained against.
model, tokenizer = setup_chat_format(model, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()