In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
import os, torch, wandb, platform, warnings
from datasets import load_dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]



In [2]:
base_model = "/notebooks/models/checker/models/Mistral-7B-Instruct-v0.2/"

In [3]:
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
# model.gradient_checkpointing_enable()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

(True, True)

In [5]:
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )
model = get_peft_model(model, peft_config)

In [6]:
training_arguments = TrainingArguments(
    output_dir= "./results",
    num_train_epochs= 120,
    per_device_train_batch_size= 8,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 5000,
    logging_steps= 30,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
)

In [10]:
import pandas as pd

df = pd.read_csv("/notebooks/ds/data/joined.csv", index_col="index")

B_INST, E_INST = "[INST]", "[/INST]"
def formatPrompt(row):
    raw_instruction = (
f"""Decide if User is requesting for the following Action. Only return true/false
{row["action"]}

{B_INST}{row["input"]}
{E_INST}{row["output"]}"""
    )
    return raw_instruction

df["instruction"] = df.apply(formatPrompt, axis=1)

In [8]:
from datasets import Dataset

dataset = Dataset.from_pandas(df[["instruction", "output"]]).remove_columns(["index"])

In [9]:
dict_split_ds = dataset.train_test_split(test_size=0.1, seed=42)

In [10]:
train_ds = dict_split_ds["train"]
test_ds = dict_split_ds["test"]

In [11]:
train_ds

Dataset({
    features: ['instruction', 'output'],
    num_rows: 162
})

In [12]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="instruction",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)



Map:   0%|          | 0/162 [00:00<?, ? examples/s]



In [13]:
trainer.train()
model.config.use_cache = True
model.eval()

[34m[1mwandb[0m: Currently logged in as: [33mara-klaytn[0m ([33mklayassist[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
30,2.7751
60,0.9712
90,0.1797
120,0.0799
150,0.0637
180,0.0593
210,0.0585
240,0.0552
270,0.0545
300,0.0524


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): Lin

In [14]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model  # Take care of distributed/parallel training
model_to_save.save_pretrained("/notebooks/output")

In [41]:
row = df.iloc[0]
tokenizer(formatPrompt(row), return_tensors="pt")

{'input_ids': tensor([[    1,  6712,   547,   513,  1247,   349,  2159,   288,   354,   272,
          2296,  9624, 28723,  6352,   604,  1132, 28748,  3952,    13,  6799,
           861,  1264,   345,  3828,  9573,   617,   548,   345,  6518,  1264,
           345,  8398,  2188, 28742, 28713,  7873, 17395,    13,    13, 28792,
         16289, 28793,  5221, 28705, 28774, 28770, 28783, 28781, 16246,   298,
         28705, 28734, 28744,   280, 28754, 28716, 28782,  2576,   410,  1644,
         28734, 28709, 28759, 28729, 28796, 27557, 28774, 28759, 28729,  7406,
         28781, 17900, 28716, 28765,  1724, 28743, 28727, 28783, 28713, 28755,
          1925,  5194, 28750, 28798, 28796, 28768, 28765, 28727, 28765, 16259,
         28750, 28802, 28709, 11661, 28728, 28783, 28737, 28769,  6042,  3124,
         28782, 28824, 28764, 28769, 28796, 28765,    13, 28792, 28748, 16289,
         28793,  6995,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [44]:
text = "adfadgagd"
device = "cuda:0"

row = { "action": "{\"name\":\"Send\",\"description\":\"Transfer an amount of klay/tokens to another account\"}", "input": text, "output": "" }

inputs = tokenizer(formatPrompt(row), return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50, )

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Decide if User is requesting for the following Action. Only return true/false
{"name":"Send","description":"Transfer an amount of klay/tokens to another account"}

[INST]adfadgagd
[/INST] I


In [35]:
stream("Send 10 tokens to 0x12378126dfdhfyd")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


{'input_ids': tensor([[    1,   733, 16289, 28793, 27332,  2135, 28747,    13,  6039,   547,
           513,  1247,   349,  2159,   288,   354,   272,  2296,  9624, 28723,
          6352,   604,  1132, 28748,  3952,    13, 27332,  9624, 28747,    13,
          6799,   861, 10549,  8363,  5988,  6518, 10549, 20708,   396,  3558,
           302,   446,  7459, 28748, 20228,   298,  1698,  2708, 17395,    13,
         27332,  1247, 28747,    13,  8363, 28705, 28740, 28734, 16246,   298,
         28705, 28734, 28744, 28740, 28750, 28770, 28787, 28783, 28740, 28750,
         28784,  4053, 22410, 28722,  3389,    13,    13, 28792, 28748, 16289,
         28793,     2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='c

tensor([[    1,   733, 16289, 28793, 27332,  2135, 28747,    13,  6039,   547,
           513,  1247,   349,  2159,   288,   354,   272,  2296,  9624, 28723,
          6352,   604,  1132, 28748,  3952,    13, 27332,  9624, 28747,    13,
          6799,   861, 10549,  8363,  5988,  6518, 10549, 20708,   396,  3558,
           302,   446,  7459, 28748, 20228,   298,  1698,  2708, 17395,    13,
         27332,  1247, 28747,    13,  8363, 28705, 28740, 28734, 16246,   298,
         28705, 28734, 28744, 28740, 28750, 28770, 28787, 28783, 28740, 28750,
         28784,  4053, 22410, 28722,  3389,    13,    13, 28792, 28748, 16289,
         28793,     2,     1]], device='cuda:0')

# Inference

In [54]:
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0}
)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.add_bos_token, tokenizer.add_eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

(True, True)

In [55]:
model.load_adapter("/notebooks/output")

In [58]:
def stream(user_prompt):
    runtimeFlag = "cuda:0"
    input = user_prompt.strip()
    

    
    prompt = formatPrompt({"action":"{\"name\":\"Send\",\"description\":\"Transfer an amount of klay/tokens to another account\"}","input":input, "output":""})
    print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(runtimeFlag)
    print(inputs)

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=50)
    return _

In [64]:
stream("Can you help me check my balance")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Decide if User is requesting for the following Action. Only return true/false
{"name":"Send","description":"Transfer an amount of klay/tokens to another account"}

[INST]Can you help me check my balance
[/INST]
{'input_ids': tensor([[    1,  6712,   547,   513,  1247,   349,  2159,   288,   354,   272,
          2296,  9624, 28723,  6352,   604,  1132, 28748,  3952,    13,  6799,
           861, 10549,  8363,  5988,  6518, 10549, 20708,   396,  3558,   302,
           446,  7459, 28748, 20228,   298,  1698,  2708, 17395,    13,    13,
         28792, 16289, 28793,  6325,   368,  1316,   528,  1877,   586,  7873,
            13, 28792, 28748, 16289, 28793,     2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
I'm an AI and don't have the ability to check your balance directly. However

tensor([[    1,  6712,   547,   513,  1247,   349,  2159,   288,   354,   272,
          2296,  9624, 28723,  6352,   604,  1132, 28748,  3952,    13,  6799,
           861, 10549,  8363,  5988,  6518, 10549, 20708,   396,  3558,   302,
           446,  7459, 28748, 20228,   298,  1698,  2708, 17395,    13,    13,
         28792, 16289, 28793,  6325,   368,  1316,   528,  1877,   586,  7873,
            13, 28792, 28748, 16289, 28793,     2, 28737, 28742, 28719,   396,
         16107,   304,   949, 28742, 28707,   506,   272,  5537,   298,  1877,
           574,  7873,  5090, 28723,  2993, 28725,   315,   541,  8327,   368,
           356,   910,   298,  1877,   378,  3936, 28723,    13,    13,  1551,
          1877,   574,  7873, 28725,   368,   541,   938,   272,  3445,  1552,
          5221, 28705, 28774, 28774, 28774, 28734]], device='cuda:0')