In [1]:
!pip install -U bitsandbytes accelerate transformers peft


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=

# Prepare Dataset

In [2]:
from huggingface_hub import login

login(token="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")

# Training Configuration

In [3]:
from google.colab import files
uploaded = files.upload()

Saving tokenized_dataset2.zip to tokenized_dataset2.zip


In [5]:
!unzip tokenized_dataset2.zip

Archive:  tokenized_dataset2.zip
   creating: tokenized_dataset2/
  inflating: tokenized_dataset2/dataset_dict.json  
   creating: tokenized_dataset2/test/
  inflating: tokenized_dataset2/test/data-00000-of-00001.arrow  
  inflating: tokenized_dataset2/test/dataset_info.json  
  inflating: tokenized_dataset2/test/state.json  
   creating: tokenized_dataset2/train/
  inflating: tokenized_dataset2/train/data-00000-of-00001.arrow  
  inflating: tokenized_dataset2/train/dataset_info.json  
  inflating: tokenized_dataset2/train/state.json  


In [33]:
import os
from datasets import load_from_disk

tokenized_dataset_train = load_from_disk("/content/tokenized_dataset2/train")
tokenized_dataset_test = load_from_disk("/content/tokenized_dataset2/test")

In [34]:
print(type(tokenized_dataset_train))


<class 'datasets.arrow_dataset.Dataset'>


**Step1: Model & LoRA Setup**

In [35]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # QLoRA = 4-bit
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adjust for model architecture
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)


**Part 2: Evaluation Metrics (Accuracy & F1)**

In [36]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    preds = np.argmax(preds, axis=-1)

    # Optional: Mask padding tokens
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro")
    }


**Part 3: Trainer Setup**

In [39]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./tinyllama-log-classifier",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=2e-4,
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_test,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    compute_metrics=compute_metrics
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


**Train**

In [None]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss
10,0.3719
20,0.3171
30,0.3248
40,0.2973
50,0.2645
60,0.267
70,0.2582
80,0.2586
90,0.2454
100,0.2416


Step,Training Loss
10,0.3719
20,0.3171
30,0.3248
40,0.2973
50,0.2645
60,0.267
70,0.2582
80,0.2586
90,0.2454
100,0.2416


# Save the Fine-Tuned Mode

In [None]:
model.save_pretrained("llama-log-classifier")
tokenizer.save_pretrained("llama-log-classifier")
