In [None]:
!pip install -q -U bitsandbytes datasets
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U trl

In [None]:
from datasets import load_dataset
from datasets import Dataset
# Work on the first 10,000 rows of the train split only.
ds = load_dataset("vilm/OpenOrca-Viet", split="train[:10000]")
# Hold out 1% for evaluation; the fixed seed makes the split identical on every run.
ds = ds.train_test_split(test_size=0.01, seed=42)

In [None]:
# Cell output: summary of the train/test splits created above.
ds

In [None]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer,GPTQConfig
from transformers import BitsAndBytesConfig
import torch
# 8-bit weight quantization through bitsandbytes.
# BitsAndBytesConfig only exposes quant-type / compute-dtype / double-quant
# settings for 4-bit loading (`bnb_4bit_*`); there are no `bnb_8bit_*`
# parameters, so none are passed here.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)
# Base checkpoint to fine-tune; any causal-LM repo id from the Hugging Face Hub can be used.
modelName = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(modelName)
# Quantization is requested through `quantization_config` alone. Passing
# `load_in_8bit=True` as well is redundant and is rejected by newer
# transformers releases when a quantization config is also supplied.
model = AutoModelForCausalLM.from_pretrained(
    modelName,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): allows code shipped in the model repo to run locally -
    # confirm this is actually needed for the chosen checkpoint.
    trust_remote_code=True,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Tokenizer settings used for fine-tuning (the tokenizer itself is loaded above).
tokenizer.pad_token = tokenizer.eos_token  # reuse EOS as the padding token
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = True
# Cell output: the BOS/EOS insertion flags after the changes above.
tokenizer.add_bos_token, tokenizer.add_eos_token


In [None]:
def text_formatting(data):
  """Render one dataset row as a single chat-formatted training string.

  Reads the row's "instruction", "input" and "output" fields and maps them to
  the system, user and assistant turns respectively, then stores the rendered
  conversation under the new key "chat_format".

  Args:
    data: one dataset row (dict-like) with "instruction", "input" and
      "output" entries.

  Returns:
    The same row with the additional "chat_format" entry.
  """
  messages = [
    {
        "role": "system",
        "content": data["instruction"],
    },
    {"role": "user", "content": data["input"]},
    {"role": "assistant", "content": data["output"]},
  ]
  # The conversation already ends with the assistant's answer, so no generation
  # prompt must be appended: it would leave a dangling assistant header at the
  # end of every training example.
  data['chat_format'] = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
  return data

# Add the rendered "chat_format" column to both splits.
for split_name in ("train", "test"):
  ds[split_name] = ds[split_name].map(text_formatting)

In [None]:
# Inspect one formatted example. Indexing the row first avoids materialising
# the whole "chat_format" column just to read a single value.
print(ds["train"][5]["chat_format"])

In [None]:
# Helper that counts how many of the model's parameters can be trained
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``parameters()`` like a ``torch.nn.Module``.

    Prints one line with the trainable parameter count, the total parameter
    count and the trainable share in percent. Returns ``None``.
    """
    trainable_params = 0
    all_param = 0
    for param in model.parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A module without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Set config for LoRA
from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training
# Prepare the quantized base model before adapters are attached.
model_lora = prepare_model_for_kbit_training(model)

# Projection layers that receive a LoRA adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,                # rank of the low-rank update matrices
    lora_alpha=16,      # alpha scaling
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapter and report how much of it is trainable.
model_lora = get_peft_model(model_lora, config)
print_trainable_parameters(model_lora)

In [None]:
import transformers
from transformers import Trainer,EarlyStoppingCallback,TrainingArguments
from trl import SFTTrainer

# Hyperparameters
training_arguments = TrainingArguments(
    output_dir='outputs',
    # --- optimisation ---
    num_train_epochs=1,
    per_device_train_batch_size=24,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    warmup_steps=100,
    fp16=True,
    # --- logging, evaluation, checkpoints ---
    logging_steps=20,
    evaluation_strategy="steps",
    save_total_limit=5,
    load_best_model_at_end=True,
)
# Supervised fine-tuning on the pre-rendered "chat_format" text column.
# `model_lora` already carries the LoRA adapter (it was wrapped with
# get_peft_model), so no `peft_config` is passed: handing the trainer both a
# PEFT model and a PEFT config is redundant.
trainer = SFTTrainer(
    model=model_lora,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    max_seq_length=512,
    dataset_text_field="chat_format",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

In [None]:
# Baseline: evaluation metrics on the held-out split before any training step.
trainer.evaluate()

In [None]:
# Run the fine-tuning loop configured by `training_arguments`.
trainer.train()

In [None]:
# Evaluation metrics on the held-out split after training, to compare with the earlier run.
trainer.evaluate()

In [None]:
from matplotlib import pyplot as plt


def _collect_metric(log_history, key):
    """Return (steps, values) for every log entry that recorded ``key``.

    Entries are selected by key instead of by position, so summary entries
    (which carry neither "loss" nor "eval_loss") and runs where training and
    evaluation logs do not strictly alternate are handled without KeyError.
    """
    steps, values = [], []
    for position, entry in enumerate(log_history):
        if key in entry:
            # Fall back to the list position if an entry has no step number.
            steps.append(entry.get("step", position))
            values.append(entry[key])
    return steps, values


train_loss_steps, trainingEpoch_loss_adam = _collect_metric(trainer.state.log_history, "loss")
val_loss_steps, validationEpoch_loss_adam = _collect_metric(trainer.state.log_history, "eval_loss")

# Plot against the step number so both curves share one x axis even when
# they contain a different number of points.
plt.plot(train_loss_steps, trainingEpoch_loss_adam, label='train_loss')
plt.plot(val_loss_steps, validationEpoch_loss_adam, label='val_loss')
plt.xlabel('step')
plt.ylabel('loss')
plt.legend()
plt.show()

In [None]:
import torch
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import torch
from transformers import pipeline, TextStreamer
from peft import PeftConfig, PeftModel
# Quick qualitative check of the fine-tuned adapter on a single question.
messages = [
    {
        "role": "system",
        "content": "Bạn là một trợ lý AI giúp mọi người tìm kiếm thông tin.",
    },
    {"role": "user", "content": "Ai là người đầu tiên lên mặt trăng?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")
# `add_eos_token` was switched on for training, so the tokenizer may append EOS
# to the prompt. A prompt must not end with EOS, so drop it if present.
if inputs["input_ids"][0, -1].item() == tokenizer.eos_token_id:
    inputs = {name: tensor[:, :-1] for name, tensor in inputs.items()}
# Move every tensor (ids and attention mask) to the chosen device.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Leave training mode so dropout is off during generation.
model_lora.eval()
streamer = TextStreamer(tokenizer)
with torch.autocast(device.type):
    outputs = model_lora.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=50,
        no_repeat_ngram_size=2,
        num_beams=1,
        num_return_sequences=1,
        # The cache was disabled on the config for training; enable it for decoding.
        use_cache=True,
    )

In [None]:
!huggingface-cli login --token=<mytoken>

In [None]:
# Upload the LoRA adapter and the tokenizer to the same Hub repository.
model_name = "vietzephyr-7b-lora-8bit"
HUGGING_FACE_USER_NAME = "Apricity0201"
repo_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
# `token=True` uses the credentials stored by the login step; it replaces the
# deprecated `use_auth_token` argument.
model_lora.push_to_hub(repo_id, token=True)
tokenizer.push_to_hub(repo_id, token=True)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub repository that holds the fine-tuned LoRA adapter.
model_path = "Apricity0201/vietzephyr-7b-lora-8bit"
# The adapter config records which base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(model_path)
model_2 = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    # Request 8-bit loading through a quantization config; the bare
    # `load_in_8bit=True` keyword is deprecated.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
qa_model = PeftModel.from_pretrained(model_2, model_path)

In [None]:
from transformers import pipeline, TextStreamer
# Ask the reloaded adapter model the same question as a smoke test.
messages = [
    {
        "role": "system",
        "content": "Bạn là một trợ lý AI giúp mọi người tìm kiếm thông tin.",
    },
    {"role": "user", "content": "Ai là người đầu tiên lên mặt trăng?"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Put the input tensors on the same device as the model before generating.
inputs = tokenizer(prompt, return_tensors="pt").to(qa_model.device)
streamer = TextStreamer(tokenizer)
# torch.autocast replaces the deprecated torch.cuda.amp.autocast().
with torch.autocast(qa_model.device.type):
    output_tokens = qa_model.generate(**inputs, streamer=streamer, max_new_tokens=100)