<a href="https://colab.research.google.com/github/jchuang0710/Finetuning-on-ART/blob/main/Finetuning_on_ART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the base model checkout. Clone only when the directory is missing so that
# re-running this cell does not abort with "destination path ... already exists".
![ -d TinyLlama-1.1B-Chat-v0.3 ] || git clone https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3

fatal: destination path 'TinyLlama-1.1B-Chat-v0.3' already exists and is not an empty directory.


In [None]:
# Install every third-party package the notebook imports. `%pip` targets the
# running kernel's environment (unlike `!pip`, which uses whatever pip is on PATH).
%pip install -U datasets
%pip install fsspec==2023.9.2
%pip install bitsandbytes peft
# vllm is imported by the evaluation cell; without it that cell fails with
# ModuleNotFoundError. NOTE(review): vllm may pull its own torch build, so a
# runtime restart after this cell may be needed - confirm on the target runtime.
%pip install vllm



In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import json
import gzip
import random
import torch
from transformers import (
    LlamaTokenizerFast, LlamaForCausalLM,
    Trainer, TrainingArguments, BitsAndBytesConfig
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
def load_json(file_path):
  """Read the UTF-8 encoded JSON file at `file_path` and return the decoded object."""
  with open(file_path, "r", encoding="utf-8") as handle:
    raw_text = handle.read()
  return json.loads(raw_text)

def load_jsonl(file_path):
  """Load a JSON Lines file into a list of decoded objects.

  Args:
    file_path: Path to a UTF-8 encoded file holding one JSON document per line.

  Returns:
    A list with one decoded object per non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  with open(file_path, "r", encoding="utf-8") as f:
    # Blank / whitespace-only lines (e.g. a trailing newline at end of file)
    # are not JSON documents; skip them instead of failing in json.loads.
    return [json.loads(line) for line in f if line.strip()]

def dump_json_gz(data, file_path):
  """Serialize `data` as JSON (non-ASCII kept as-is) into a gzip-compressed UTF-8 text file."""
  with gzip.open(file_path, "wt", encoding="utf-8") as gz_out:
    gz_out.write(json.dumps(data, ensure_ascii=False))

In [None]:
full_dataset = load_jsonl("ART.jsonl")  # <<=== your full dataset

# Fixed seed so the in-place shuffle, and therefore the split, is reproducible.
random.seed(42)
random.shuffle(full_dataset)

# 80% train / 10% dev / 10% test, cut at shared boundary indices.
n_total = len(full_dataset)
train_end = int(n_total * 0.8)
dev_end = int(n_total * 0.9)

train = full_dataset[:train_end]
dev = full_dataset[train_end:dev_end]
test = full_dataset[dev_end:]

# Persist each split as pretty-printed JSON, keeping non-ASCII characters readable.
for split_path, split_rows in (("train.json", train), ("dev.json", dev), ("test.json", test)):
  with open(split_path, "w", encoding="utf-8") as f:
    json.dump(split_rows, f, ensure_ascii=False, indent=2)

In [None]:
# Directory name of the base model checkout created by the `git clone` cell;
# the tokenizer is loaded from the same location.
model_id = "TinyLlama-1.1B-Chat-v0.3"
tokenizer = LlamaTokenizerFast.from_pretrained(model_id)

# Instruction prompt used for both training and evaluation. The single `{}`
# placeholder receives the command text; the answer follows "Answer:".
PROMPT_TEMPLATE = (
    "Please determine whether these command using which MITRE technique. "
    "Just reply most similar technique ID, not to explain.\n"
    "\n"
    "Command:\n"
    "{}\n"
    "Answer:"
)

def iter_dataset(file_path):
  """Yield `(user_content, assistant_content)` for every sample in a JSON split file.

  Each sample is read through its "messages" list. When a role appears more
  than once the last message with that role wins; a role that never appears
  yields an empty string.
  """
  for sample in load_json(file_path):
    latest = {"user": "", "assistant": ""}
    for msg in sample["messages"]:
      role = msg["role"]
      if role in latest:
        latest[role] = msg["content"]
    yield latest["user"], latest["assistant"]

def build_prompt(user_content):
    """Return PROMPT_TEMPLATE with `user_content` substituted for its `{}` placeholder."""
    filled_prompt = PROMPT_TEMPLATE.format(user_content)
    return filled_prompt

In [None]:
def tokenize_dataset(file_path, tokenizer):
  """Tokenize a split file into fixed-length causal-LM training examples.

  Every sample becomes "<prompt> <answer>" followed by one EOS token, and is
  then right-padded to the length of the longest sequence in the split.

  Args:
    file_path: Path to a JSON split file readable by `iter_dataset`.
    tokenizer: Tokenizer exposing `encode(text)` and `eos_token_id`.

  Returns:
    A `(dataset, maxlen)` tuple. `dataset` is a list of dicts with
    "input_ids", "attention_mask" and "labels" (all of length `maxlen`);
    `maxlen` is the padded sequence length, or 0 for an empty split.
  """
  ds_tokens = []
  for user_content, assistant_content in iter_dataset(file_path):
    prompt = build_prompt(user_content)
    full_text = f"{prompt} {assistant_content}"
    ds_tokens.append(tokenizer.encode(full_text) + [tokenizer.eos_token_id])

  # `default=0` keeps an empty split from crashing max() on an empty sequence.
  maxlen = max((len(seq) for seq in ds_tokens), default=0)

  dataset = []
  for tokens in ds_tokens:
    pad_len = maxlen - len(tokens)
    dataset.append({
      # EOS doubles as the pad token for the inputs.
      "input_ids": tokens + [tokenizer.eos_token_id] * pad_len,
      # Mask out padding so the model does not attend to it.
      "attention_mask": [1] * len(tokens) + [0] * pad_len,
      # -100 is the label value ignored by the loss, so padding positions no
      # longer contribute to (and dominate) the training/eval loss. The one
      # real EOS at the end of `tokens` is still learned.
      "labels": tokens + [-100] * pad_len,
    })

  return dataset, maxlen

# Tokenize the train and dev splits (each is padded to its own maximum length).
train_tokens, maxlen_train = tokenize_dataset(file_path="train.json", tokenizer=tokenizer)
dev_tokens, maxlen_dev = tokenize_dataset(file_path="dev.json", tokenizer=tokenizer)

# Write both tokenized splits as gzip-compressed JSON.
for split_tokens, gz_path in (
  (train_tokens, "train.tokens.json.gz"),
  (dev_tokens, "dev.tokens.json.gz"),
):
  dump_json_gz(split_tokens, gz_path)

In [None]:
# Map split names to the tokenized files written in the previous cell.
data_files = dict(
  train="train.tokens.json.gz",
  dev="dev.tokens.json.gz",
)

# Load both splits with the "json" builder; they are accessed as dataset["train"] / dataset["dev"].
dataset = load_dataset("json", data_files=data_files)

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_compute_dtype=torch.float16,
  # Lower precision, but saves GPU memory.
  bnb_4bit_quant_type="fp4",
  # Double quantization is turned off. NOTE(review): the original comment said
  # this saves GPU memory; double quantization normally reduces memory, so
  # confirm the intent of disabling it.
  bnb_4bit_use_double_quant=False,
)

# Load the quantized base model and let `device_map="auto"` place it on the available device(s).
model = LlamaForCausalLM.from_pretrained(
  model_id,
  device_map="auto",
  quantization_config=bnb_config,
)

In [None]:
# LoRA adapter hyperparameters for causal-LM fine-tuning:
# rank 8, alpha 16, 10% dropout on the adapter path, no bias terms trained.
lora_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  r=8,
  lora_alpha=16,
  lora_dropout=0.1,
  bias="none",
)

# Wrap the base model with the adapters; `model` is rebound to the PEFT wrapper,
# so re-running this cell without reloading the base model wraps it again.
model = get_peft_model(model, lora_config)

In [None]:
# Directory that receives checkpoints and the final saved model/tokenizer.
output_dir = "Models/TinyLlama-1B-MITRE"

training_args = TrainingArguments(
  output_dir=output_dir,
  run_name="TinyLlama-MITRE-QLoRA-15GB",
  num_train_epochs=3,
  # Batch size 1 to minimize GPU memory use.
  per_device_train_batch_size=1,
  per_device_eval_batch_size=1,
  # Accumulate 8 steps before each update: effective batch size of 8.
  gradient_accumulation_steps=8,
  # Evaluate and checkpoint every 50 steps, keeping only the 2 newest checkpoints.
  eval_strategy="steps",
  eval_steps=50,
  save_strategy="steps",
  save_steps=50,
  save_total_limit=2,
  # fp16 is recommended for the 15GB GPU; bf16 stays off.
  fp16=True,
  bf16=False,
  logging_steps=10,
)

In [None]:
# Wire the PEFT-wrapped model, the training arguments and both tokenized splits together.
trainer_kwargs = dict(
  model=model,
  args=training_args,
  train_dataset=dataset["train"],
  eval_dataset=dataset["dev"],
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning, then write the trained model and the tokenizer to `output_dir`.
trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Currently logged in as: [33mjchuang0710[0m ([33mjchuang0710-national-yang-ming-chiao-tung-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
50,5.6367,3.695029
100,0.3428,0.670105
150,0.2331,0.51962
200,0.2129,0.471136
250,0.1725,0.43145
300,0.1782,0.400026
350,0.1591,0.375649
400,0.1673,0.362508
450,0.1408,0.35464


('Models/TinyLlama-1B-MITRE/tokenizer_config.json',
 'Models/TinyLlama-1B-MITRE/special_tokens_map.json',
 'Models/TinyLlama-1B-MITRE/tokenizer.model',
 'Models/TinyLlama-1B-MITRE/added_tokens.json',
 'Models/TinyLlama-1B-MITRE/tokenizer.json')

In [None]:
from vllm import LLM, SamplingParams

from vllm.lora.request import LoRARequest

# Collect the prompts and gold technique IDs of the held-out test split.
test_prompts, test_labels = [], []
for user_content, assistant_content in iter_dataset("test.json"):
  test_prompts.append(build_prompt(user_content))
  # Predictions are stripped before comparison, so strip the labels too;
  # otherwise a label with stray whitespace could never match.
  test_labels.append(assistant_content.strip())

# `trainer.save_model(output_dir)` was called on a PEFT-wrapped model, so
# `output_dir` holds the LoRA adapter rather than full model weights and cannot
# be loaded by `LLM(output_dir)`. Load the base model and attach the adapter.
llm = LLM(model_id, dtype="float16", enable_lora=True)
lora_request = LoRARequest("mitre-lora", 1, output_dir)

# Greedy decoding; a technique ID fits well within 10 new tokens.
sampling_params = SamplingParams(
  max_tokens=10,
  temperature=0.0,
)

outputs = llm.generate(test_prompts, sampling_params, lora_request=lora_request)

# Exact-match scoring of the stripped generation against the gold label.
results = []
for out, label in zip(outputs, test_labels):
  pred = out.outputs[0].text.strip()
  results.append(pred == label)

# Guard against an empty test split (would otherwise divide by zero).
accuracy = sum(results) / len(results) if results else 0.0
print(f"Test Accuracy: {accuracy:.2%}")

ModuleNotFoundError: No module named 'vllm'