In [20]:
%%capture
%pip install -U bitsandbytes
%pip install transformers==4.44.2
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U wandb

In [21]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [22]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [23]:
wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='ALSATE', 
    job_type="training", 
    anonymous="allow",
    name="ALSATE-v3"
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [24]:
base_model = "meta-llama/Llama-3.2-3B-Instruct"
new_model = "llama-3.2-3b-sys-log-analysis-alsate-v1"
dataset_name = "k-arthik-r/sys-logs-L0-to-L4-12.6k"

In [25]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [26]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [27]:
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.shuffle(seed=65)

instruction = """
Your name is "ALSATE",you are an advanced syslog parsing and analysis tool. Your task is to analyze provided system logs, identify potential causes of their generation, and detect any security threats or anomalies. If threats are found, suggest precise remediation steps. Respond only when the input is a valid system log; otherwise, reply with: 'Input does not appear to be a valid system log. Unable to assist.'
"""

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": row["Logs"]},
        {"role": "assistant", "content": row["Cause and Remediation"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(format_chat_template, num_proc=4)
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [28]:
dataset['text'][1000]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Nov 2024\n\nYour name is "ALSATE",you are an advanced syslog parsing and analysis tool. Your task is to analyze provided system logs, identify potential causes of their generation, and detect any security threats or anomalies. If threats are found, suggest precise remediation steps. Respond only when the input is a valid system log; otherwise, reply with: \'Input does not appear to be a valid system log. Unable to assist.\'<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nNov 24 00:49:54: Critical system failure: Network driver unable to initialize, system cannot access the network.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nCause - The network driver failed to load, preventing system connectivity.\nRemediation - 1. Reinstall or update the network driver. 2. Check and replace hardware like the network adapter if necessary. 3. Verify network configurat

In [29]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [30]:
tokenizer.chat_template = None

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [31]:
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    eval_strategy="steps",
    eval_steps=0.05,
    save_steps=0.05,
    save_total_limit=3,
    logging_steps=1,
    warmup_steps=1000,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    max_grad_norm=0.8,
    weight_decay = 0.1,
    load_best_model_at_end=True,
    report_to="wandb"
)

In [32]:
trainer = SFTTrainer(
    model=model,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/2536 [00:00<?, ? examples/s]

In [33]:
trainer.train()

Step,Training Loss,Validation Loss
508,0.6688,0.599993
1016,0.4298,0.548614
1524,0.5689,0.50278
2032,0.5097,0.47604
2540,0.4303,0.464909
3048,0.3309,0.445699
3556,0.5066,0.438191
4064,0.1782,0.426397
4572,0.4116,0.413832
5080,0.3494,0.406196


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=10142, training_loss=0.42139538980872054, metrics={'train_runtime': 16559.7318, 'train_samples_per_second': 1.225, 'train_steps_per_second': 0.612, 'total_flos': 6.687393512157389e+16, 'train_loss': 0.42139538980872054, 'epoch': 1.999802819678596})

In [34]:
wandb.finish()

0,1
eval/loss,█▆▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁
eval/runtime,▃▄▄▃▃▃▃▇▅▂▂▂▃▁▃█▂▂▆
eval/samples_per_second,▆▅▆▆▆▇▆▂▄▇▇▇▆█▇▁▇▇▄
eval/steps_per_second,▆▅▆▆▆▇▆▂▄▇▇▇▆█▇▁▇▇▄
train/epoch,▁▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▇▇▇▇▇█████
train/grad_norm,█▅▆▆▂▃▃▃▅▅▃▂▂▃▂▂▅▂▁▁▂▃▂▃▃▂▃▁▅▂▄▄▃▃▃▂▃▂▅▂
train/learning_rate,▃▃▃▄▄█████▇▇▇▆▆▆▆▆▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁
train/loss,▆▇▄▆▆▆▅▆▄▄▃▄▄█▄▄▅▃▃▁▅▅▅▃▄▃▄▃▄▃▄▂▆▃▂▁▁▃▁▂

0,1
eval/loss,0.36356
eval/runtime,428.6051
eval/samples_per_second,5.917
eval/steps_per_second,5.917
total_flos,6.687393512157389e+16
train/epoch,1.9998
train/global_step,10142.0
train/grad_norm,0.55762
train/learning_rate,0.0
train/loss,0.21


In [35]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "Nov 27 20:47:32 ubuntu kernel: Kernel panic: Failed to initialize boot loader, system crash imminent."}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


Cause - The boot loader failed to initialize, causing a system crash.
Remediation - Reinstall or repair the bootloader. Verify the integrity of the bootloader configuration and files.


In [36]:
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)



adapter_model.safetensors:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/k-arthik-r/llama-3.2-3b-sys-log-analysis-alsate-v1/commit/a6a2feb1422aea50fe48ed77d184a3ed162f4c63', commit_message='Upload model', commit_description='', oid='a6a2feb1422aea50fe48ed77d184a3ed162f4c63', pr_url=None, repo_url=RepoUrl('https://huggingface.co/k-arthik-r/llama-3.2-3b-sys-log-analysis-alsate-v1', endpoint='https://huggingface.co', repo_type='model', repo_id='k-arthik-r/llama-3.2-3b-sys-log-analysis-alsate-v1'), pr_revision=None, pr_num=None)