# Setup

In [6]:
# Install the required libraries (uncomment the lines below on a fresh environment)
# !pip install -q huggingface_hub
# !pip install -q -U trl transformers accelerate peft
# !pip install -q -U datasets bitsandbytes einops wandb
# !pip install  -q ipywidgets
# !pip install -q scipy

In [1]:
# Log in to the Hugging Face Hub from inside the notebook; the call renders an
# interactive widget (the recorded output of this cell is a VBox).
# NOTE(review): presumably required because the model-loading cells pass
# use_auth_token=True — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
# PeftModel is imported here as well: it is needed later to load the fine-tuned adapter for inference
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
# 



# Dataset

In [3]:
import tqdm.auto as tqdm
import json
def read_jsonl(path):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Yields:
        The value decoded by ``json.loads`` from each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Iterate over the file handle instead of read().splitlines(): str.splitlines
    # also breaks on characters such as U+2028, which may appear unescaped inside
    # JSON strings, whereas file iteration only splits on real newlines.
    # The encoding is pinned to UTF-8 so decoding does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank / whitespace-only lines (e.g. a trailing empty line)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            yield json.loads(line)

In [4]:
from datasets import load_dataset

# Load the training and evaluation examples from two local JSON Lines files.
# NOTE(review): both calls ask for split='train' — presumably because the 'json'
# loader places a bare data_files path under a split with that name; confirm
# against the datasets documentation.
train_dataset = load_dataset('json', data_files='cs_train.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='cs_eval.jsonl', split='train')

In [5]:
def _format_device_prompt(example):
    """Render the question/answer training prompt for ONE device record."""
    # Work on a copy so the caller's record is never modified; empty strings are
    # shown to the model as 'N/A'.
    fields = {key: ('N/A' if example[key] == '' else example[key]) for key in example}

    # NOTE(review): the template text is intentionally left byte-identical to the
    # original (including the missing space after the OUI sentence and the stray
    # "g " before "Give") so prompts stay consistent with already-trained
    # checkpoints; clean it up only together with a re-train.
    return f"### Question: A WiFi Inspector collects the following information of a device. The device's OUI belongs to {fields['oui_friendly']}.The device's DHCP Hostname is {fields['dhcp_hostname']}. The device's Netdisco information is {json.dumps(fields['netdisco_info'])}. The device visits domains ({fields['domains']}). g Give me the device's vendor name only without explanation\n ### Answer: {fields['output']}"


def formatting_func(example):
    """Build the supervised fine-tuning prompt(s) for a record or a batch of records.

    Args:
        example: Either a single record (mapping of field name -> value) or a
            batched mapping of field name -> list of values with one entry per
            row. Both shapes must provide the keys 'oui_friendly',
            'dhcp_hostname', 'netdisco_info', 'domains' and 'output'.

    Returns:
        A list of prompt strings: one element for a single record, one element
        per row for a batch.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # A batch is recognised by the label column being a sequence rather than a
    # single string.
    if isinstance(example['output'], (list, tuple)):
        keys = list(example)
        row_count = len(example['output'])
        return [
            _format_device_prompt({key: example[key][row] for key in keys})
            for row in range(row_count)
        ]

    return [_format_device_prompt(example)]

In [6]:
# Hand-written sample record used to eyeball the prompt template; the empty
# DHCP hostname exercises the 'N/A' substitution.
my_dict = {
    "oui_friendly": "Sonos, Inc.",
    "dhcp_hostname": "",
    "domains": "sonos.com, amazonaws.com",
    "netdisco_info": {
        "device_type": "sonos",
        "name": "192.168.85.20 - Sonos Playbar",
        "model_name": "Sonos Playbar",
        "manufacturer": "Sonos, Inc.",
    },
    "output": "sonos",
}

# List the field names, one per line, in insertion order.
for key in my_dict:
    print(key)

# Show the rendered prompt for the sample record.
print(formatting_func(my_dict))

oui_friendly
dhcp_hostname
domains
netdisco_info
output
['### Question: A WiFi Inspector collects the following information of a device. The device\'s OUI belongs to Sonos, Inc..The device\'s DHCP Hostname is N/A. The device\'s Netdisco information is {"device_type": "sonos", "name": "192.168.85.20 - Sonos Playbar", "model_name": "Sonos Playbar", "manufacturer": "Sonos, Inc."}. The device visits domains (sonos.com, amazonaws.com). g Give me the device\'s vendor name only without explanation\n ### Answer: sonos']


In [7]:
# Load the base model in 4-bit precision together with its tokenizer.
# NOTE(review): the name has no "org/" prefix — presumably a local directory; confirm.
base_model_name = "llama-2-7b-hf"

# 4-bit NF4 weight quantization with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# device_map="auto" lets the loader decide where each module is placed.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)
# NOTE(review): cache use is switched off on the config — presumably because the
# model is about to be trained rather than used for generation; confirm.
base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1

# The EOS token doubles as the padding token, and padding is added on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Directory that receives the checkpoints written during training.
output_dir = "./llama-2-7b-hf-fine-tune-baby"

# Optimisation, logging, checkpointing and evaluation schedule for the run.
# Logging, saving and evaluation are all configured with the same 100-step interval.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    # Gradients are accumulated over 8 batches of size 1 before each optimizer step.
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    logging_steps=100,
    max_steps=1000,
    logging_dir="./logs",        # Directory for storing logs
    save_strategy="steps",       # Save checkpoints on a step schedule (see save_steps)
    save_steps=100,                # Save a checkpoint every 100 steps
    evaluation_strategy="steps", # Evaluate on a step schedule (see eval_steps)
    eval_steps=100,               # Evaluate every 100 steps
    do_eval=True                 # Turn evaluation on (scheduled by evaluation_strategy / eval_steps)
)

In [9]:
# LoRA adapter configuration handed to the trainer: rank-64 adapters with
# alpha 16 and 10% dropout for a causal language model; bias handling is "none".
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [10]:
# Upper bound on sequence length passed to the trainer.
max_seq_length = 512
# Supervised fine-tuning of the quantized base model: the LoRA settings come from
# peft_config, the prompt text for each example comes from formatting_func, and
# the schedule comes from training_args.
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    formatting_func=formatting_func,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)

# pass in resume_from_checkpoint=True to resume from a checkpoint
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, i.e.
# the run captured in this notebook was stopped manually before finishing.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mhexplode2021[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [10]:
# Reload the 4-bit base model and tokenizer (used afterwards to attach the
# fine-tuned adapter for inference).
import gc

# The recorded run of this cell failed with "Some modules are dispatched on the
# CPU or the disk", which is what device_map="auto" reports when the model does
# not fit in free GPU memory. When this cell runs in a kernel that still holds
# the model/trainer from the training cells, drop those references and release
# cached CUDA memory first so the new copy can be placed on the GPU.
# NOTE(review): if memory is still short (e.g. other references keep the old
# model alive), restart the kernel and run only the import and inference cells.
for _stale_name in ("trainer", "model", "base_model"):
    globals().pop(_stale_name, None)
del _stale_name
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

base_model_name="llama-2-7b-hf"
# 4-bit NF4 weight quantization with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)
# The EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [3]:
# Wrap the quantized base model with the adapter stored in the checkpoint-500 directory.
# NOTE(review): this is an absolute, machine-specific path and it does not sit
# under the output_dir used by the training cell — adjust before re-running elsewhere.
model = PeftModel.from_pretrained(
    base_model,
    "/media/extend/Programming/Python/llama-2/llama-2-7b-hf-fine-tune-baby/1e-4-4-4/checkpoint-500",
)

In [7]:
# Quick manual generation check with the fine-tuned model.
# Earlier prompt variant, kept for reference:
# eval_prompt = \
# """Question: A WiFi Inspector collects the following information of a device. The device\'s OUI belongs to N/A.The device\'s DHCP Hostname is LGwebosTV. The device\'s Netdisco information is {}. The device visits domains (). Which vendor does this device belong to? \n"""
# NOTE(review): the active prompt below does not follow the
# "### Question: ... ### Answer:" template used for training, which may explain
# why the recorded output merely repeats the prompt.
eval_prompt = """
'Cyndi's iPhone'.
"""

# Put the inputs on the device the model actually lives on instead of
# hard-coding "cuda".
model_input = tokenizer(eval_prompt, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    # pad_token_id is passed explicitly: it is the value generate() would fall
    # back to anyway (see the recorded "Setting `pad_token_id` to `eos_token_id`"
    # message), so the output is unchanged but the warning disappears.
    generated_ids = model.generate(
        **model_input,
        max_new_tokens=10,
        pad_token_id=tokenizer.eos_token_id,
    )
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



'Cyndi's iPhone'.
'Cyndi's iPhone'.
'
